Compare commits

..

4 Commits

Author SHA1 Message Date
031c8b32a4 Add time comment
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-17 13:50:44 +00:00
ac08d45200 Merge branch 'main' into mamba_tests 2025-03-17 13:49:56 +00:00
a5d29e9ee1 undo massive formatting change
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-15 17:31:21 +00:00
696245c2fc Add SSM and Hybrid Models Test
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-15 17:26:01 +00:00
15 changed files with 109 additions and 95 deletions

View File

@ -200,7 +200,6 @@ steps:
- pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/sample
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
@ -454,6 +453,15 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
- label: SSM and Hybrid Models Test # 12min
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language/test_hybrid.py
- tests/models/decoder_only/language/test_mamba.py
commands:
- pytest -v -s models/decoder_only/language/test_hybrid.py
- pytest -v -s models/decoder_only/language/test_mamba.py
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true

View File

@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
* ✅︎
* ✅︎
*
* ⚠️
- * `GLM4VForCausalLM`<sup>^</sup>
* GLM-4V
* T + I
@ -948,11 +948,8 @@ V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
:::
:::{note}

View File

@ -20,7 +20,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs

View File

@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.5.4
mistral-common==1.5.1
# via -r requirements/test.in
more-itertools==10.5.0
# via lm-eval

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import json
import pickle
import pytest
@ -209,6 +208,8 @@ def test_guided_decoding_backend_options():
def test_pickle_xgrammar_tokenizer_data():
# TODO: move to another test file for xgrammar
try:
import xgrammar as xgr
except ImportError:
@ -216,11 +217,7 @@ def test_pickle_xgrammar_tokenizer_data():
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
TokenizerData)
tokenizer_data = TokenizerData(
metadata=
'{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
encoded_vocab=['!', '"', '#', '$', '%'],
)
tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
pickled = pickle.dumps(tokenizer_data)
assert pickled is not None
@ -228,5 +225,4 @@ def test_pickle_xgrammar_tokenizer_data():
depickled: TokenizerData = pickle.loads(pickled)
assert depickled is not None
assert json.loads(
depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
assert depickled.vocab_type == xgr.VocabType.RAW

View File

@ -18,6 +18,9 @@ MODELS_TO_TEST = [
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
]
# Undo after https://github.com/vllm-project/vllm/pull/14868
pytest.skip(allow_module_level=True)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",

View File

@ -1191,7 +1191,7 @@ class EngineArgs:
NOTE: for autoselection of V0 vs V1 engine, we need to
create the ModelConfig first, since ModelConfig's attrs
(e.g. the model arch) are needed to make the decision.
This function set VLLM_USE_V1=X if VLLM_USE_V1 is
unspecified by the user.
@ -1576,6 +1576,10 @@ class EngineArgs:
#############################################################
# Experimental Features - allow users to opt in.
# MLA is is supported on V1, but off by default for now.
if model_config.use_mla and _warn_or_fallback("MLA"):
return False
# LoRA is supported on V1, but off by default for now.
if self.enable_lora and _warn_or_fallback("LORA"):
return False

View File

@ -9,6 +9,7 @@ from vllm.model_executor.guided_decoding.reasoner import get_reasoner
from vllm.model_executor.guided_decoding.utils import (
convert_lark_to_gbnf, grammar_is_likely_lark,
has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
from vllm.platforms import CpuArchEnum
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer
@ -25,7 +26,7 @@ def maybe_backend_fallback(
def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
fallback: str) -> None:
"""Change the backend to the specified fallback with a warning log,
"""Change the backend to the specified fallback with a warning log,
or raise a ValueError if the `no-fallback` option is specified."""
if guided_params.no_fallback():
raise ValueError(message)
@ -52,12 +53,19 @@ def maybe_backend_fallback(
if guided_params.backend_name == "xgrammar":
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
xgr_installed)
# xgrammar only has x86 wheels for linux, fallback to outlines
from vllm.platforms import current_platform
if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
fallback_or_error(guided_params,
"xgrammar is only supported on x86 CPUs.",
"outlines")
# xgrammar doesn't support regex, fallback to outlines
if guided_params.regex is not None:
fallback_or_error(
guided_params,
"xgrammar does not support regex guided decoding.", "outlines")
# xgrammar doesn't support some JSON schema features
elif (guided_params.json is not None
and has_xgrammar_unsupported_json_features(guided_params.json)):

View File

@ -9,11 +9,13 @@ from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, List
import torch
from transformers import PreTrainedTokenizerFast
from vllm.logger import init_logger
try:
import xgrammar as xgr
from xgrammar.base import _core as xgr_core
xgr_installed = True
except ImportError:
xgr_installed = False
@ -33,6 +35,7 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
# TODO: passing batch size to max threads here
def get_local_xgrammar_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer,
@ -49,8 +52,18 @@ def get_local_xgrammar_guided_decoding_logits_processor(
@dataclass(frozen=True)
class TokenizerData:
"""Immutable container for cached tokenizer data."""
metadata: str
encoded_vocab: list[str] = field(default_factory=list)
stop_token_ids: list[int] | None = None
# These fields are mutually exclusive: `backend_str` is used to create a
# TokenizeInfo with `TokenizerInfo.from_huggingface` while `vocab_type` is
# used within the constructor of TokenizeInfo
backend_str: str | None = None
vocab_type: xgr.VocabType | None = None
def __post_init__(self):
# Check for mutual exclusive
assert not (self.backend_str and self.vocab_type), \
"backend_str and vocab_type are mutual exclusive"
class TokenizerDataCache:
@ -58,52 +71,46 @@ class TokenizerDataCache:
_cache: dict[int, TokenizerData] = {}
@classmethod
def get_tokenizer_data(
cls,
tokenizer: PreTrainedTokenizer,
/,
*,
tokenizer_hash: int,
vocab_size: int,
) -> TokenizerData:
def get_tokenizer_data(cls,
tokenizer: PreTrainedTokenizer) -> TokenizerData:
tokenizer_hash = hash(tokenizer)
if tokenizer_hash not in cls._cache:
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
tokenizer,
# NOTE: We will need to use lm_head's vocab_size
# to determine correct special_token_ids for this tokenizer.
# See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92 # noqa: E501
vocab_size=vocab_size,
)
metadata = json.loads(tokenizer_info.dump_metadata())
# Vendored from xgrammar logic to get encoded_vocab
# https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501
# Vendored from xgrammar logic since we cannot pickle the tokenizer
# https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501
try:
vocab_dict = tokenizer.get_vocab()
encoded_vocab = [
token for token, _ in sorted(tokenizer.get_vocab().items(),
key=lambda x: x[1])
]
except AttributeError as e:
raise ValueError(
f"Cannot get the vocabulary of the tokenizer "
f"{type(tokenizer)}. The tokenizer should have a "
"get_vocab method.") from e
# maintain tokenizer's indexing
encoded_vocab = [""] * tokenizer_info.vocab_size
for token, idx in vocab_dict.items():
if idx < tokenizer_info.vocab_size:
encoded_vocab[idx] = token
stop_token_ids = None
backend_str = ""
vocab_type = xgr.VocabType.RAW
if isinstance(tokenizer, MistralTokenizer):
if stop_token_ids is None and hasattr(
tokenizer,
"eos_token_id") and tokenizer.eos_token_id is not None:
stop_token_ids = [tokenizer.eos_token_id]
if isinstance(tokenizer, PreTrainedTokenizerFast):
backend_str = tokenizer.backend_tokenizer.to_str()
vocab_type = None
elif isinstance(tokenizer, MistralTokenizer):
# REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
metadata.update({
"vocab_type": xgr.VocabType.BYTE_FALLBACK,
"add_prefix_space": True
})
vocab_type = xgr.VocabType.BYTE_FALLBACK
cls._cache[tokenizer_hash] = TokenizerData(
encoded_vocab=encoded_vocab,
metadata=json.dumps(metadata),
)
stop_token_ids=stop_token_ids,
backend_str=backend_str,
vocab_type=vocab_type)
return cls._cache[tokenizer_hash]
@ -122,15 +129,30 @@ class GrammarCompilerCache:
cache_key = str(config.tokenizer_hash)
if cache_key not in cls._cache:
assert config.tokenizer_data is not None
assert config.tokenizer_data.encoded_vocab is not None
config_data = config.tokenizer_data
# In TokenizerDataCache.get_tokenizer_data, a serializable
# tokenizer_data is created and cached. This data is used to build
# a tokenizer_info and create an xgrammar compiler.
tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata(
encoded_vocab=config_data.encoded_vocab,
metadata=config_data.metadata,
)
# - If tokenizer_data has backend_str set, use
# xgr_core.TokenizerInfo.from_huggingface (a C++ bind).
# - Otherwise, use the default constructor with vocab_type.
# - xgr_core.TokenizerInfo.from_huggingface !=
# xgr.TokenizerInfo.from_huggingface.
if config_data.backend_str:
tokenizer_info = xgr.TokenizerInfo._create_from_handle(
xgr_core.TokenizerInfo.from_huggingface(
config_data.encoded_vocab, config_data.backend_str,
config.vocab_size, config_data.stop_token_ids))
else:
tokenizer_info = xgr.TokenizerInfo(
config_data.encoded_vocab,
config_data.vocab_type,
vocab_size=config.vocab_size,
stop_token_ids=config_data.stop_token_ids)
cls._cache[cache_key] = xgr.GrammarCompiler(
tokenizer_info, max_threads=config.max_threads)
@ -141,12 +163,13 @@ class GrammarCompilerCache:
class GrammarConfig:
"""Serializable configuration for grammar compilation"""
tokenizer_hash: int
tokenizer_data: TokenizerData
vocab_size: int
json_str: str | None = None
grammar_str: str | None = None
json_object: bool | None = None
any_whitespace: bool = True
max_threads: int = 8
tokenizer_data: TokenizerData | None = None
@classmethod
def from_guided_params(cls,
@ -156,11 +179,7 @@ class GrammarConfig:
max_threads: int = 8) -> GrammarConfig:
tokenizer_hash = hash(tokenizer)
tokenizer_data = TokenizerDataCache.get_tokenizer_data(
tokenizer,
tokenizer_hash=tokenizer_hash,
vocab_size=model_config.hf_text_config.vocab_size,
)
tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
if guided_params.json:
if not isinstance(guided_params.json, str):
@ -199,6 +218,7 @@ class GrammarConfig:
raise ValueError(str(err)) from err
return cls(json_str=json_str,
vocab_size=model_config.hf_text_config.vocab_size,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads,
tokenizer_data=tokenizer_data,
@ -226,12 +246,14 @@ class GrammarConfig:
raise ValueError(str(err)) from err
return cls(grammar_str=grammar_str,
vocab_size=model_config.hf_text_config.vocab_size,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads,
tokenizer_data=tokenizer_data)
elif guided_params.json_object:
return cls(
json_object=True,
vocab_size=model_config.hf_text_config.vocab_size,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads,
tokenizer_data=tokenizer_data,
@ -245,6 +267,7 @@ class GrammarConfig:
return cls(
grammar_str=choice_str,
vocab_size=model_config.hf_text_config.vocab_size,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads,
tokenizer_data=tokenizer_data,
@ -268,13 +291,6 @@ class GrammarConfig:
grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
return grammar
@staticmethod
def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo:
return xgr.TokenizerInfo.from_vocab_and_metadata(
encoded_vocab=tokenizer_data.encoded_vocab,
metadata=tokenizer_data.metadata,
)
@dataclass
class XGrammarLogitsProcessor:
@ -283,16 +299,11 @@ class XGrammarLogitsProcessor:
reasoner: Reasoner | None = None
ctx: xgr.CompiledGrammar | None = None
tokenizer_info: xgr.TokenizerInfo = None # type: ignore[assignment]
token_bitmask: torch.Tensor = None # type: ignore[assignment]
matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
batch_size: int = field(default=1)
prefilled: bool = field(default=False)
def __post_init__(self):
self.tokenizer_info = self.config.tokenizer_info(
self.config.tokenizer_data)
def __getstate__(self) -> dict[str, Any]:
return {'config': self.config, 'reasoner': self.reasoner}
@ -300,8 +311,6 @@ class XGrammarLogitsProcessor:
self.config = state['config']
self.reasoner = state['reasoner']
self.tokenizer_info = GrammarConfig.tokenizer_info(
self.config.tokenizer_data)
self.ctx = None
self.matchers = []
self.batch_size = 1
@ -343,7 +352,7 @@ class XGrammarLogitsProcessor:
xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
]
self.token_bitmask = xgr.allocate_token_bitmask(
self.batch_size, self.tokenizer_info.vocab_size)
self.batch_size, self.config.vocab_size)
if not self.prefilled:
# Have not sampled a token yet

View File

@ -38,8 +38,6 @@ from .utils import (is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix)
KVCache = Tuple[torch.Tensor, torch.Tensor]
class BambaMLP(nn.Module):

View File

@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP, SupportsV0Only)
SupportsMultiModal, SupportsPP)
from .siglip import SiglipVisionModel
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
maybe_prefix, merge_multimodal_embeddings)
@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
info=Gemma3ProcessingInfo,
dummy_inputs=Gemma3DummyInputsBuilder)
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
SupportsLoRA, SupportsV0Only):
SupportsLoRA):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",

View File

@ -111,7 +111,6 @@ class MixtralAttention(nn.Module):
def __init__(
self,
config: MixtralConfig,
hidden_size: int,
num_heads: int,
num_kv_heads: int,
@ -137,9 +136,7 @@ class MixtralAttention(nn.Module):
# the KV heads across multiple tensor parallel GPUs.
assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MixtralConfig has an optional head_dim argument
self.head_dim = getattr(config, "head_dim",
self.hidden_size // self.total_num_heads)
self.head_dim = hidden_size // self.total_num_heads
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
@ -203,7 +200,6 @@ class MixtralDecoderLayer(nn.Module):
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = MixtralAttention(
config=config,
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,

View File

@ -165,7 +165,6 @@ class MixtralAttention(nn.Module):
def __init__(
self,
config: MixtralConfig,
hidden_size: int,
num_heads: int,
num_kv_heads: int,
@ -191,9 +190,7 @@ class MixtralAttention(nn.Module):
# the KV heads across multiple tensor parallel GPUs.
assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MixtralConfig has an optional head_dim argument
self.head_dim = getattr(config, "head_dim",
self.hidden_size // self.total_num_heads)
self.head_dim = hidden_size // self.total_num_heads
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
@ -255,7 +252,6 @@ class MixtralDecoderLayer(nn.Module):
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = MixtralAttention(
config=config,
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,

View File

@ -73,7 +73,7 @@ class PixtralImagePixelInputs(TypedDict):
"""
A boolean mask indicating which image embeddings correspond
to patch tokens.
Shape: `(batch_size, num_images, num_embeds)`
"""
@ -849,10 +849,10 @@ class VisionTransformer(nn.Module):
) -> torch.Tensor:
"""
Args:
images: list of N_img images of variable sizes,
images: list of N_img images of variable sizes,
each of shape (C, H, W)
Returns:
image_features: tensor of token features for
image_features: tensor of token features for
all tokens of all images of shape (N_toks, D)
"""
# pass images through initial convolution independently
@ -935,8 +935,7 @@ class PatchMerger(nn.Module):
# x is (N, vision_encoder_dim)
x = self.permute(x, image_sizes)
# x is (N / spatial_merge_size ** 2,
# vision_encoder_dim * spatial_merge_size ** 2)
# x is (N / spatial_merge_size ** 2, vision_encoder_dim * spatial_merge_size ** 2)
x = self.merging_layer(x)
# x is (N / spatial_merge_size ** 2, vision_encoder_dim)

View File

@ -40,7 +40,7 @@ class StructuredOutputManager:
tokenizer_group.ping()
tokenizer = tokenizer_group.get_lora_tokenizer(None)
self.vocab_size = self.vllm_config.model_config.get_vocab_size()
self.vocab_size = len(tokenizer.get_vocab())
if isinstance(tokenizer, MistralTokenizer):
# NOTE: ideally, xgrammar should handle this accordingly.
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98