Add time comment

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Merge branch 'main' into mamba_tests
2025-03-17 13:50:44 +00:00 · 2025-03-17 13:49:56 +00:00 · 2025-03-15 17:31:21 +00:00 · 2025-03-15 17:26:01 +00:00
15 changed files with 109 additions and 95 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -200,7 +200,6 @@ steps:
    - pytest -v -s v1/core
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/engine
-    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
@ -454,6 +453,15 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

+- label: SSM and Hybrid Models Test # 12min
+  source_file_dependencies:
+      - vllm/
+      - tests/models/decoder_only/language/test_hybrid.py
+      - tests/models/decoder_only/language/test_mamba.py
+  commands:
+      - pytest -v -s models/decoder_only/language/test_hybrid.py
+      - pytest -v -s models/decoder_only/language/test_mamba.py
+
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
  optional: true
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
  * ✅︎
  * ✅︎
-  *
+  * ⚠️
 - * `GLM4VForCausalLM`<sup>^</sup>
  * GLM-4V
  * T + I
@ -948,11 +948,8 @@ V1 currently uses a simplified attention pattern:
 - Uses causal attention for all tokens, including image tokens
 - Generates reasonable outputs but does not match the original model's attention for text + image inputs
 - Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`

 This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
-For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
 :::

 :::{note}
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -20,7 +20,7 @@ tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
    #   typepy
 mdurl==0.1.2
    # via markdown-it-py
-mistral-common==1.5.4
+mistral-common==1.5.1
    # via -r requirements/test.in
 more-itertools==10.5.0
    # via lm-eval
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0

-import json
 import pickle

 import pytest
@ -209,6 +208,8 @@ def test_guided_decoding_backend_options():


 def test_pickle_xgrammar_tokenizer_data():
+
+    # TODO: move to another test file for xgrammar
    try:
        import xgrammar as xgr
    except ImportError:
@ -216,11 +217,7 @@ def test_pickle_xgrammar_tokenizer_data():

    from vllm.model_executor.guided_decoding.xgrammar_decoding import (
        TokenizerData)
-    tokenizer_data = TokenizerData(
-        metadata=
-        '{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
-        encoded_vocab=['!', '"', '#', '$', '%'],
-    )
+    tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
    pickled = pickle.dumps(tokenizer_data)

    assert pickled is not None
@ -228,5 +225,4 @@ def test_pickle_xgrammar_tokenizer_data():
    depickled: TokenizerData = pickle.loads(pickled)

    assert depickled is not None
-    assert json.loads(
-        depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
+    assert depickled.vocab_type == xgr.VocabType.RAW
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@ -18,6 +18,9 @@ MODELS_TO_TEST = [
    "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
 ]

+# Undo after https://github.com/vllm-project/vllm/pull/14868
+pytest.skip(allow_module_level=True)
+

@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -1191,7 +1191,7 @@ class EngineArgs:
        NOTE: for autoselection of V0 vs V1 engine, we need to
        create the ModelConfig first, since ModelConfig's attrs
        (e.g. the model arch) are needed to make the decision.
-
+        
        This function set VLLM_USE_V1=X if VLLM_USE_V1 is
        unspecified by the user.

@ -1576,6 +1576,10 @@ class EngineArgs:
        #############################################################
        # Experimental Features - allow users to opt in.

+        # MLA is supported on V1, but off by default for now.
+        if model_config.use_mla and _warn_or_fallback("MLA"):
+            return False
+
        # LoRA is supported on V1, but off by default for now.
        if self.enable_lora and _warn_or_fallback("LORA"):
            return False
--- a/vllm/model_executor/guided_decoding/init.py
+++ b/vllm/model_executor/guided_decoding/init.py
@ -9,6 +9,7 @@ from vllm.model_executor.guided_decoding.reasoner import get_reasoner
 from vllm.model_executor.guided_decoding.utils import (
    convert_lark_to_gbnf, grammar_is_likely_lark,
    has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
+from vllm.platforms import CpuArchEnum

 if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer
@ -25,7 +26,7 @@ def maybe_backend_fallback(

    def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
                          fallback: str) -> None:
-        """Change the backend to the specified fallback with a warning log,
+        """Change the backend to the specified fallback with a warning log, 
        or raise a ValueError if the `no-fallback` option is specified."""
        if guided_params.no_fallback():
            raise ValueError(message)
@ -52,12 +53,19 @@ def maybe_backend_fallback(
    if guided_params.backend_name == "xgrammar":
        from vllm.model_executor.guided_decoding.xgrammar_decoding import (
            xgr_installed)
+        # xgrammar only has x86 wheels for linux, fallback to outlines
+        from vllm.platforms import current_platform
+        if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
+            fallback_or_error(guided_params,
+                              "xgrammar is only supported on x86 CPUs.",
+                              "outlines")

        # xgrammar doesn't support regex, fallback to outlines
        if guided_params.regex is not None:
            fallback_or_error(
                guided_params,
                "xgrammar does not support regex guided decoding.", "outlines")
+
        # xgrammar doesn't support some JSON schema features
        elif (guided_params.json is not None
              and has_xgrammar_unsupported_json_features(guided_params.json)):
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@ -9,11 +9,13 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, List

 import torch
+from transformers import PreTrainedTokenizerFast

 from vllm.logger import init_logger

 try:
    import xgrammar as xgr
+    from xgrammar.base import _core as xgr_core
    xgr_installed = True
 except ImportError:
    xgr_installed = False
@ -33,6 +35,7 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)


+# TODO: passing batch size to max threads here
 def get_local_xgrammar_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer: PreTrainedTokenizer,
@ -49,8 +52,18 @@ def get_local_xgrammar_guided_decoding_logits_processor(
@dataclass(frozen=True)
 class TokenizerData:
    """Immutable container for cached tokenizer data."""
-    metadata: str
    encoded_vocab: list[str] = field(default_factory=list)
+    stop_token_ids: list[int] | None = None
+    # These fields are mutually exclusive: `backend_str` is used to create a
+    # TokenizerInfo with `TokenizerInfo.from_huggingface` while `vocab_type` is
+    # used within the constructor of TokenizerInfo
+    backend_str: str | None = None
+    vocab_type: xgr.VocabType | None = None
+
+    def __post_init__(self):
+        # Check for mutual exclusivity
+        assert not (self.backend_str and self.vocab_type), \
+            "backend_str and vocab_type are mutual exclusive"


 class TokenizerDataCache:
@ -58,52 +71,46 @@ class TokenizerDataCache:
    _cache: dict[int, TokenizerData] = {}

    @classmethod
-    def get_tokenizer_data(
-        cls,
-        tokenizer: PreTrainedTokenizer,
-        /,
-        *,
-        tokenizer_hash: int,
-        vocab_size: int,
-    ) -> TokenizerData:
+    def get_tokenizer_data(cls,
+                           tokenizer: PreTrainedTokenizer) -> TokenizerData:
+        tokenizer_hash = hash(tokenizer)

        if tokenizer_hash not in cls._cache:
-            tokenizer_info = xgr.TokenizerInfo.from_huggingface(
-                tokenizer,
-                # NOTE: We will need to use lm_head's vocab_size
-                # to determine correct special_token_ids for this tokenizer.
-                # See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92  # noqa: E501
-                vocab_size=vocab_size,
-            )
-            metadata = json.loads(tokenizer_info.dump_metadata())
-
-            # Vendored from xgrammar logic to get encoded_vocab
-            # https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501
+            # Vendored from xgrammar logic since we cannot pickle the tokenizer
+            # https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501
            try:
-                vocab_dict = tokenizer.get_vocab()
+                encoded_vocab = [
+                    token for token, _ in sorted(tokenizer.get_vocab().items(),
+                                                 key=lambda x: x[1])
+                ]
            except AttributeError as e:
                raise ValueError(
                    f"Cannot get the vocabulary of the tokenizer "
                    f"{type(tokenizer)}. The tokenizer should have a "
                    "get_vocab method.") from e

-            # maintain tokenizer's indexing
-            encoded_vocab = [""] * tokenizer_info.vocab_size
-            for token, idx in vocab_dict.items():
-                if idx < tokenizer_info.vocab_size:
-                    encoded_vocab[idx] = token
+            stop_token_ids = None
+            backend_str = ""
+            vocab_type = xgr.VocabType.RAW

-            if isinstance(tokenizer, MistralTokenizer):
+            if stop_token_ids is None and hasattr(
+                    tokenizer,
+                    "eos_token_id") and tokenizer.eos_token_id is not None:
+                stop_token_ids = [tokenizer.eos_token_id]
+
+            if isinstance(tokenizer, PreTrainedTokenizerFast):
+                backend_str = tokenizer.backend_tokenizer.to_str()
+                vocab_type = None
+
+            elif isinstance(tokenizer, MistralTokenizer):
                # REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
-                metadata.update({
-                    "vocab_type": xgr.VocabType.BYTE_FALLBACK,
-                    "add_prefix_space": True
-                })
+                vocab_type = xgr.VocabType.BYTE_FALLBACK

            cls._cache[tokenizer_hash] = TokenizerData(
                encoded_vocab=encoded_vocab,
-                metadata=json.dumps(metadata),
-            )
+                stop_token_ids=stop_token_ids,
+                backend_str=backend_str,
+                vocab_type=vocab_type)

        return cls._cache[tokenizer_hash]

@ -122,15 +129,30 @@ class GrammarCompilerCache:
        cache_key = str(config.tokenizer_hash)

        if cache_key not in cls._cache:
+            assert config.tokenizer_data is not None
+            assert config.tokenizer_data.encoded_vocab is not None
+
            config_data = config.tokenizer_data

            # In TokenizerDataCache.get_tokenizer_data, a serializable
            # tokenizer_data is created and cached. This data is used to build
            # a tokenizer_info and create an xgrammar compiler.
-            tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata(
-                encoded_vocab=config_data.encoded_vocab,
-                metadata=config_data.metadata,
-            )
+            # - If tokenizer_data has backend_str set, use
+            # xgr_core.TokenizerInfo.from_huggingface (a C++ bind).
+            # - Otherwise, use the default constructor with vocab_type.
+            # - xgr_core.TokenizerInfo.from_huggingface !=
+            #   xgr.TokenizerInfo.from_huggingface.
+            if config_data.backend_str:
+                tokenizer_info = xgr.TokenizerInfo._create_from_handle(
+                    xgr_core.TokenizerInfo.from_huggingface(
+                        config_data.encoded_vocab, config_data.backend_str,
+                        config.vocab_size, config_data.stop_token_ids))
+            else:
+                tokenizer_info = xgr.TokenizerInfo(
+                    config_data.encoded_vocab,
+                    config_data.vocab_type,
+                    vocab_size=config.vocab_size,
+                    stop_token_ids=config_data.stop_token_ids)
            cls._cache[cache_key] = xgr.GrammarCompiler(
                tokenizer_info, max_threads=config.max_threads)

@ -141,12 +163,13 @@ class GrammarCompilerCache:
 class GrammarConfig:
    """Serializable configuration for grammar compilation"""
    tokenizer_hash: int
-    tokenizer_data: TokenizerData
+    vocab_size: int
    json_str: str | None = None
    grammar_str: str | None = None
    json_object: bool | None = None
    any_whitespace: bool = True
    max_threads: int = 8
+    tokenizer_data: TokenizerData | None = None

    @classmethod
    def from_guided_params(cls,
@ -156,11 +179,7 @@ class GrammarConfig:
                           max_threads: int = 8) -> GrammarConfig:

        tokenizer_hash = hash(tokenizer)
-        tokenizer_data = TokenizerDataCache.get_tokenizer_data(
-            tokenizer,
-            tokenizer_hash=tokenizer_hash,
-            vocab_size=model_config.hf_text_config.vocab_size,
-        )
+        tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)

        if guided_params.json:
            if not isinstance(guided_params.json, str):
@ -199,6 +218,7 @@ class GrammarConfig:
                raise ValueError(str(err)) from err

            return cls(json_str=json_str,
+                       vocab_size=model_config.hf_text_config.vocab_size,
                       tokenizer_hash=tokenizer_hash,
                       max_threads=max_threads,
                       tokenizer_data=tokenizer_data,
@ -226,12 +246,14 @@ class GrammarConfig:
                raise ValueError(str(err)) from err

            return cls(grammar_str=grammar_str,
+                       vocab_size=model_config.hf_text_config.vocab_size,
                       tokenizer_hash=tokenizer_hash,
                       max_threads=max_threads,
                       tokenizer_data=tokenizer_data)
        elif guided_params.json_object:
            return cls(
                json_object=True,
+                vocab_size=model_config.hf_text_config.vocab_size,
                tokenizer_hash=tokenizer_hash,
                max_threads=max_threads,
                tokenizer_data=tokenizer_data,
@ -245,6 +267,7 @@ class GrammarConfig:

            return cls(
                grammar_str=choice_str,
+                vocab_size=model_config.hf_text_config.vocab_size,
                tokenizer_hash=tokenizer_hash,
                max_threads=max_threads,
                tokenizer_data=tokenizer_data,
@ -268,13 +291,6 @@ class GrammarConfig:
        grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
        return grammar

-    @staticmethod
-    def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo:
-        return xgr.TokenizerInfo.from_vocab_and_metadata(
-            encoded_vocab=tokenizer_data.encoded_vocab,
-            metadata=tokenizer_data.metadata,
-        )
-

@dataclass
 class XGrammarLogitsProcessor:
@ -283,16 +299,11 @@ class XGrammarLogitsProcessor:
    reasoner: Reasoner | None = None

    ctx: xgr.CompiledGrammar | None = None
-    tokenizer_info: xgr.TokenizerInfo = None  # type: ignore[assignment]
    token_bitmask: torch.Tensor = None  # type: ignore[assignment]
    matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
    batch_size: int = field(default=1)
    prefilled: bool = field(default=False)

-    def __post_init__(self):
-        self.tokenizer_info = self.config.tokenizer_info(
-            self.config.tokenizer_data)
-
    def __getstate__(self) -> dict[str, Any]:
        return {'config': self.config, 'reasoner': self.reasoner}

@ -300,8 +311,6 @@ class XGrammarLogitsProcessor:
        self.config = state['config']
        self.reasoner = state['reasoner']

-        self.tokenizer_info = GrammarConfig.tokenizer_info(
-            self.config.tokenizer_data)
        self.ctx = None
        self.matchers = []
        self.batch_size = 1
@ -343,7 +352,7 @@ class XGrammarLogitsProcessor:
                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
            ]
            self.token_bitmask = xgr.allocate_token_bitmask(
-                self.batch_size, self.tokenizer_info.vocab_size)
+                self.batch_size, self.config.vocab_size)

        if not self.prefilled:
            # Have not sampled a token yet
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@ -38,8 +38,6 @@ from .utils import (is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)

-KVCache = Tuple[torch.Tensor, torch.Tensor]
-

 class BambaMLP(nn.Module):

--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP, SupportsV0Only)
+                         SupportsMultiModal, SupportsPP)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                    maybe_prefix, merge_multimodal_embeddings)
@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
                                        info=Gemma3ProcessingInfo,
                                        dummy_inputs=Gemma3DummyInputsBuilder)
 class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
-                                     SupportsLoRA, SupportsV0Only):
+                                     SupportsLoRA):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@ -111,7 +111,6 @@ class MixtralAttention(nn.Module):

    def __init__(
        self,
-        config: MixtralConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
@ -137,9 +136,7 @@ class MixtralAttention(nn.Module):
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        # MixtralConfig has an optional head_dim argument
-        self.head_dim = getattr(config, "head_dim",
-                                self.hidden_size // self.total_num_heads)
+        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
@ -203,7 +200,6 @@ class MixtralDecoderLayer(nn.Module):
        # Requires transformers > 4.32.0
        rope_theta = getattr(config, "rope_theta", 10000)
        self.self_attn = MixtralAttention(
-            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            max_position=config.max_position_embeddings,
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@ -165,7 +165,6 @@ class MixtralAttention(nn.Module):

    def __init__(
        self,
-        config: MixtralConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
@ -191,9 +190,7 @@ class MixtralAttention(nn.Module):
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        # MixtralConfig has an optional head_dim argument
-        self.head_dim = getattr(config, "head_dim",
-                                self.hidden_size // self.total_num_heads)
+        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
@ -255,7 +252,6 @@ class MixtralDecoderLayer(nn.Module):
        # Requires transformers > 4.32.0
        rope_theta = getattr(config, "rope_theta", 10000)
        self.self_attn = MixtralAttention(
-            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            max_position=config.max_position_embeddings,
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@ -73,7 +73,7 @@ class PixtralImagePixelInputs(TypedDict):
    """
    A boolean mask indicating which image embeddings correspond
    to patch tokens.
-
+    
    Shape: `(batch_size, num_images, num_embeds)`
    """

@ -849,10 +849,10 @@ class VisionTransformer(nn.Module):
    ) -> torch.Tensor:
        """
        Args:
-            images: list of N_img images of variable sizes,
+            images: list of N_img images of variable sizes, 
                each of shape (C, H, W)
        Returns:
-            image_features: tensor of token features for
+            image_features: tensor of token features for 
                all tokens of all images of shape (N_toks, D)
        """
        # pass images through initial convolution independently
@ -935,8 +935,7 @@ class PatchMerger(nn.Module):
        # x is (N, vision_encoder_dim)
        x = self.permute(x, image_sizes)

-        # x is (N / spatial_merge_size ** 2,
-        #       vision_encoder_dim * spatial_merge_size ** 2)
+        # x is (N / spatial_merge_size ** 2, vision_encoder_dim * spatial_merge_size ** 2)
        x = self.merging_layer(x)

        # x is (N / spatial_merge_size ** 2, vision_encoder_dim)
--- a/vllm/v1/structured_output/init.py
+++ b/vllm/v1/structured_output/init.py
@ -40,7 +40,7 @@ class StructuredOutputManager:
        tokenizer_group.ping()

        tokenizer = tokenizer_group.get_lora_tokenizer(None)
-        self.vocab_size = self.vllm_config.model_config.get_vocab_size()
+        self.vocab_size = len(tokenizer.get_vocab())
        if isinstance(tokenizer, MistralTokenizer):
            # NOTE: ideally, xgrammar should handle this accordingly.
            # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
Author	SHA1	Message	Date
Tyler Michael Smith	031c8b32a4	Add time comment Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-03-17 13:50:44 +00:00
Tyler Michael Smith	ac08d45200	Merge branch 'main' into mamba_tests	2025-03-17 13:49:56 +00:00
Tyler Michael Smith	a5d29e9ee1	undo massive formatting change Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-03-15 17:31:21 +00:00
Tyler Michael Smith	696245c2fc	Add SSM and Hybrid Models Test Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-03-15 17:26:01 +00:00