Compare commits
4 Commits
v0.8.0rc2
...
mamba_test
| Author | SHA1 | Date | |
|---|---|---|---|
| 031c8b32a4 | |||
| ac08d45200 | |||
| a5d29e9ee1 | |||
| 696245c2fc |
@ -200,7 +200,6 @@ steps:
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/structured_output
|
||||
@ -454,6 +453,15 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
|
||||
|
||||
- label: SSM and Hybrid Models Test # 12min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language/test_hybrid.py
|
||||
- tests/models/decoder_only/language/test_mamba.py
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language/test_hybrid.py
|
||||
- pytest -v -s models/decoder_only/language/test_mamba.py
|
||||
|
||||
# This test is used only during the PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
optional: true
|
||||
|
||||
@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
*
|
||||
* ⚠️
|
||||
- * `GLM4VForCausalLM`<sup>^</sup>
|
||||
* GLM-4V
|
||||
* T + I
|
||||
@ -948,11 +948,8 @@ V1 currently uses a simplified attention pattern:
|
||||
- Uses causal attention for all tokens, including image tokens
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
|
||||
- Will be updated in the future to support the correct behavior
|
||||
- Does not support `"do_pan_and_scan": True`
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
|
||||
For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
|
||||
@ -20,7 +20,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer >= 0.10.11, < 0.11
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common==1.5.4
|
||||
mistral-common==1.5.1
|
||||
# via -r requirements/test.in
|
||||
more-itertools==10.5.0
|
||||
# via lm-eval
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import json
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
@ -209,6 +208,8 @@ def test_guided_decoding_backend_options():
|
||||
|
||||
|
||||
def test_pickle_xgrammar_tokenizer_data():
|
||||
|
||||
# TODO: move to another test file for xgrammar
|
||||
try:
|
||||
import xgrammar as xgr
|
||||
except ImportError:
|
||||
@ -216,11 +217,7 @@ def test_pickle_xgrammar_tokenizer_data():
|
||||
|
||||
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
||||
TokenizerData)
|
||||
tokenizer_data = TokenizerData(
|
||||
metadata=
|
||||
'{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
|
||||
encoded_vocab=['!', '"', '#', '$', '%'],
|
||||
)
|
||||
tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
|
||||
pickled = pickle.dumps(tokenizer_data)
|
||||
|
||||
assert pickled is not None
|
||||
@ -228,5 +225,4 @@ def test_pickle_xgrammar_tokenizer_data():
|
||||
depickled: TokenizerData = pickle.loads(pickled)
|
||||
|
||||
assert depickled is not None
|
||||
assert json.loads(
|
||||
depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
|
||||
assert depickled.vocab_type == xgr.VocabType.RAW
|
||||
|
||||
@ -18,6 +18,9 @@ MODELS_TO_TEST = [
|
||||
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
|
||||
]
|
||||
|
||||
# Undo after https://github.com/vllm-project/vllm/pull/14868
|
||||
pytest.skip(allow_module_level=True)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
|
||||
@ -1191,7 +1191,7 @@ class EngineArgs:
|
||||
NOTE: for autoselection of V0 vs V1 engine, we need to
|
||||
create the ModelConfig first, since ModelConfig's attrs
|
||||
(e.g. the model arch) are needed to make the decision.
|
||||
|
||||
|
||||
This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
|
||||
unspecified by the user.
|
||||
|
||||
@ -1576,6 +1576,10 @@ class EngineArgs:
|
||||
#############################################################
|
||||
# Experimental Features - allow users to opt in.
|
||||
|
||||
# MLA is supported on V1, but off by default for now.
|
||||
if model_config.use_mla and _warn_or_fallback("MLA"):
|
||||
return False
|
||||
|
||||
# LoRA is supported on V1, but off by default for now.
|
||||
if self.enable_lora and _warn_or_fallback("LORA"):
|
||||
return False
|
||||
|
||||
@ -9,6 +9,7 @@ from vllm.model_executor.guided_decoding.reasoner import get_reasoner
|
||||
from vllm.model_executor.guided_decoding.utils import (
|
||||
convert_lark_to_gbnf, grammar_is_likely_lark,
|
||||
has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
|
||||
from vllm.platforms import CpuArchEnum
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PreTrainedTokenizer
|
||||
@ -25,7 +26,7 @@ def maybe_backend_fallback(
|
||||
|
||||
def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
|
||||
fallback: str) -> None:
|
||||
"""Change the backend to the specified fallback with a warning log,
|
||||
"""Change the backend to the specified fallback with a warning log,
|
||||
or raise a ValueError if the `no-fallback` option is specified."""
|
||||
if guided_params.no_fallback():
|
||||
raise ValueError(message)
|
||||
@ -52,12 +53,19 @@ def maybe_backend_fallback(
|
||||
if guided_params.backend_name == "xgrammar":
|
||||
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
||||
xgr_installed)
|
||||
# xgrammar only has x86 wheels for linux, fallback to outlines
|
||||
from vllm.platforms import current_platform
|
||||
if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
|
||||
fallback_or_error(guided_params,
|
||||
"xgrammar is only supported on x86 CPUs.",
|
||||
"outlines")
|
||||
|
||||
# xgrammar doesn't support regex, fallback to outlines
|
||||
if guided_params.regex is not None:
|
||||
fallback_or_error(
|
||||
guided_params,
|
||||
"xgrammar does not support regex guided decoding.", "outlines")
|
||||
|
||||
# xgrammar doesn't support some JSON schema features
|
||||
elif (guided_params.json is not None
|
||||
and has_xgrammar_unsupported_json_features(guided_params.json)):
|
||||
|
||||
@ -9,11 +9,13 @@ from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any, List
|
||||
|
||||
import torch
|
||||
from transformers import PreTrainedTokenizerFast
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
try:
|
||||
import xgrammar as xgr
|
||||
from xgrammar.base import _core as xgr_core
|
||||
xgr_installed = True
|
||||
except ImportError:
|
||||
xgr_installed = False
|
||||
@ -33,6 +35,7 @@ if TYPE_CHECKING:
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
# TODO: pass batch size to max threads here
|
||||
def get_local_xgrammar_guided_decoding_logits_processor(
|
||||
guided_params: GuidedDecodingParams,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
@ -49,8 +52,18 @@ def get_local_xgrammar_guided_decoding_logits_processor(
|
||||
@dataclass(frozen=True)
|
||||
class TokenizerData:
|
||||
"""Immutable container for cached tokenizer data."""
|
||||
metadata: str
|
||||
encoded_vocab: list[str] = field(default_factory=list)
|
||||
stop_token_ids: list[int] | None = None
|
||||
# These fields are mutually exclusive: `backend_str` is used to create a
|
||||
# TokenizerInfo with `TokenizerInfo.from_huggingface` while `vocab_type` is
|
||||
# used within the constructor of TokenizerInfo
|
||||
backend_str: str | None = None
|
||||
vocab_type: xgr.VocabType | None = None
|
||||
|
||||
def __post_init__(self):
|
||||
# Check that backend_str and vocab_type are mutually exclusive
|
||||
assert not (self.backend_str and self.vocab_type), \
|
||||
"backend_str and vocab_type are mutual exclusive"
|
||||
|
||||
|
||||
class TokenizerDataCache:
|
||||
@ -58,52 +71,46 @@ class TokenizerDataCache:
|
||||
_cache: dict[int, TokenizerData] = {}
|
||||
|
||||
@classmethod
|
||||
def get_tokenizer_data(
|
||||
cls,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
/,
|
||||
*,
|
||||
tokenizer_hash: int,
|
||||
vocab_size: int,
|
||||
) -> TokenizerData:
|
||||
def get_tokenizer_data(cls,
|
||||
tokenizer: PreTrainedTokenizer) -> TokenizerData:
|
||||
tokenizer_hash = hash(tokenizer)
|
||||
|
||||
if tokenizer_hash not in cls._cache:
|
||||
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
|
||||
tokenizer,
|
||||
# NOTE: We will need to use lm_head's vocab_size
|
||||
# to determine correct special_token_ids for this tokenizer.
|
||||
# See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92 # noqa: E501
|
||||
vocab_size=vocab_size,
|
||||
)
|
||||
metadata = json.loads(tokenizer_info.dump_metadata())
|
||||
|
||||
# Vendored from xgrammar logic to get encoded_vocab
|
||||
# https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501
|
||||
# Vendored from xgrammar logic since we cannot pickle the tokenizer
|
||||
# https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501
|
||||
try:
|
||||
vocab_dict = tokenizer.get_vocab()
|
||||
encoded_vocab = [
|
||||
token for token, _ in sorted(tokenizer.get_vocab().items(),
|
||||
key=lambda x: x[1])
|
||||
]
|
||||
except AttributeError as e:
|
||||
raise ValueError(
|
||||
f"Cannot get the vocabulary of the tokenizer "
|
||||
f"{type(tokenizer)}. The tokenizer should have a "
|
||||
"get_vocab method.") from e
|
||||
|
||||
# maintain tokenizer's indexing
|
||||
encoded_vocab = [""] * tokenizer_info.vocab_size
|
||||
for token, idx in vocab_dict.items():
|
||||
if idx < tokenizer_info.vocab_size:
|
||||
encoded_vocab[idx] = token
|
||||
stop_token_ids = None
|
||||
backend_str = ""
|
||||
vocab_type = xgr.VocabType.RAW
|
||||
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
if stop_token_ids is None and hasattr(
|
||||
tokenizer,
|
||||
"eos_token_id") and tokenizer.eos_token_id is not None:
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
backend_str = tokenizer.backend_tokenizer.to_str()
|
||||
vocab_type = None
|
||||
|
||||
elif isinstance(tokenizer, MistralTokenizer):
|
||||
# REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
|
||||
metadata.update({
|
||||
"vocab_type": xgr.VocabType.BYTE_FALLBACK,
|
||||
"add_prefix_space": True
|
||||
})
|
||||
vocab_type = xgr.VocabType.BYTE_FALLBACK
|
||||
|
||||
cls._cache[tokenizer_hash] = TokenizerData(
|
||||
encoded_vocab=encoded_vocab,
|
||||
metadata=json.dumps(metadata),
|
||||
)
|
||||
stop_token_ids=stop_token_ids,
|
||||
backend_str=backend_str,
|
||||
vocab_type=vocab_type)
|
||||
|
||||
return cls._cache[tokenizer_hash]
|
||||
|
||||
@ -122,15 +129,30 @@ class GrammarCompilerCache:
|
||||
cache_key = str(config.tokenizer_hash)
|
||||
|
||||
if cache_key not in cls._cache:
|
||||
assert config.tokenizer_data is not None
|
||||
assert config.tokenizer_data.encoded_vocab is not None
|
||||
|
||||
config_data = config.tokenizer_data
|
||||
|
||||
# In TokenizerDataCache.get_tokenizer_data, a serializable
|
||||
# tokenizer_data is created and cached. This data is used to build
|
||||
# a tokenizer_info and create an xgrammar compiler.
|
||||
tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata(
|
||||
encoded_vocab=config_data.encoded_vocab,
|
||||
metadata=config_data.metadata,
|
||||
)
|
||||
# - If tokenizer_data has backend_str set, use
|
||||
# xgr_core.TokenizerInfo.from_huggingface (a C++ bind).
|
||||
# - Otherwise, use the default constructor with vocab_type.
|
||||
# - xgr_core.TokenizerInfo.from_huggingface !=
|
||||
# xgr.TokenizerInfo.from_huggingface.
|
||||
if config_data.backend_str:
|
||||
tokenizer_info = xgr.TokenizerInfo._create_from_handle(
|
||||
xgr_core.TokenizerInfo.from_huggingface(
|
||||
config_data.encoded_vocab, config_data.backend_str,
|
||||
config.vocab_size, config_data.stop_token_ids))
|
||||
else:
|
||||
tokenizer_info = xgr.TokenizerInfo(
|
||||
config_data.encoded_vocab,
|
||||
config_data.vocab_type,
|
||||
vocab_size=config.vocab_size,
|
||||
stop_token_ids=config_data.stop_token_ids)
|
||||
cls._cache[cache_key] = xgr.GrammarCompiler(
|
||||
tokenizer_info, max_threads=config.max_threads)
|
||||
|
||||
@ -141,12 +163,13 @@ class GrammarCompilerCache:
|
||||
class GrammarConfig:
|
||||
"""Serializable configuration for grammar compilation"""
|
||||
tokenizer_hash: int
|
||||
tokenizer_data: TokenizerData
|
||||
vocab_size: int
|
||||
json_str: str | None = None
|
||||
grammar_str: str | None = None
|
||||
json_object: bool | None = None
|
||||
any_whitespace: bool = True
|
||||
max_threads: int = 8
|
||||
tokenizer_data: TokenizerData | None = None
|
||||
|
||||
@classmethod
|
||||
def from_guided_params(cls,
|
||||
@ -156,11 +179,7 @@ class GrammarConfig:
|
||||
max_threads: int = 8) -> GrammarConfig:
|
||||
|
||||
tokenizer_hash = hash(tokenizer)
|
||||
tokenizer_data = TokenizerDataCache.get_tokenizer_data(
|
||||
tokenizer,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
)
|
||||
tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
|
||||
|
||||
if guided_params.json:
|
||||
if not isinstance(guided_params.json, str):
|
||||
@ -199,6 +218,7 @@ class GrammarConfig:
|
||||
raise ValueError(str(err)) from err
|
||||
|
||||
return cls(json_str=json_str,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data,
|
||||
@ -226,12 +246,14 @@ class GrammarConfig:
|
||||
raise ValueError(str(err)) from err
|
||||
|
||||
return cls(grammar_str=grammar_str,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data)
|
||||
elif guided_params.json_object:
|
||||
return cls(
|
||||
json_object=True,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data,
|
||||
@ -245,6 +267,7 @@ class GrammarConfig:
|
||||
|
||||
return cls(
|
||||
grammar_str=choice_str,
|
||||
vocab_size=model_config.hf_text_config.vocab_size,
|
||||
tokenizer_hash=tokenizer_hash,
|
||||
max_threads=max_threads,
|
||||
tokenizer_data=tokenizer_data,
|
||||
@ -268,13 +291,6 @@ class GrammarConfig:
|
||||
grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
|
||||
return grammar
|
||||
|
||||
@staticmethod
|
||||
def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo:
|
||||
return xgr.TokenizerInfo.from_vocab_and_metadata(
|
||||
encoded_vocab=tokenizer_data.encoded_vocab,
|
||||
metadata=tokenizer_data.metadata,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class XGrammarLogitsProcessor:
|
||||
@ -283,16 +299,11 @@ class XGrammarLogitsProcessor:
|
||||
reasoner: Reasoner | None = None
|
||||
|
||||
ctx: xgr.CompiledGrammar | None = None
|
||||
tokenizer_info: xgr.TokenizerInfo = None # type: ignore[assignment]
|
||||
token_bitmask: torch.Tensor = None # type: ignore[assignment]
|
||||
matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
|
||||
batch_size: int = field(default=1)
|
||||
prefilled: bool = field(default=False)
|
||||
|
||||
def __post_init__(self):
|
||||
self.tokenizer_info = self.config.tokenizer_info(
|
||||
self.config.tokenizer_data)
|
||||
|
||||
def __getstate__(self) -> dict[str, Any]:
|
||||
return {'config': self.config, 'reasoner': self.reasoner}
|
||||
|
||||
@ -300,8 +311,6 @@ class XGrammarLogitsProcessor:
|
||||
self.config = state['config']
|
||||
self.reasoner = state['reasoner']
|
||||
|
||||
self.tokenizer_info = GrammarConfig.tokenizer_info(
|
||||
self.config.tokenizer_data)
|
||||
self.ctx = None
|
||||
self.matchers = []
|
||||
self.batch_size = 1
|
||||
@ -343,7 +352,7 @@ class XGrammarLogitsProcessor:
|
||||
xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
|
||||
]
|
||||
self.token_bitmask = xgr.allocate_token_bitmask(
|
||||
self.batch_size, self.tokenizer_info.vocab_size)
|
||||
self.batch_size, self.config.vocab_size)
|
||||
|
||||
if not self.prefilled:
|
||||
# Have not sampled a token yet
|
||||
|
||||
@ -38,8 +38,6 @@ from .utils import (is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
maybe_prefix)
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
|
||||
class BambaMLP(nn.Module):
|
||||
|
||||
|
||||
@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||
SupportsMultiModal, SupportsPP, SupportsV0Only)
|
||||
SupportsMultiModal, SupportsPP)
|
||||
from .siglip import SiglipVisionModel
|
||||
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
|
||||
maybe_prefix, merge_multimodal_embeddings)
|
||||
@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
|
||||
info=Gemma3ProcessingInfo,
|
||||
dummy_inputs=Gemma3DummyInputsBuilder)
|
||||
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
SupportsLoRA, SupportsV0Only):
|
||||
SupportsLoRA):
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
|
||||
@ -111,7 +111,6 @@ class MixtralAttention(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: MixtralConfig,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
@ -137,9 +136,7 @@ class MixtralAttention(nn.Module):
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
# MixtralConfig has an optional head_dim argument
|
||||
self.head_dim = getattr(config, "head_dim",
|
||||
self.hidden_size // self.total_num_heads)
|
||||
self.head_dim = hidden_size // self.total_num_heads
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -203,7 +200,6 @@ class MixtralDecoderLayer(nn.Module):
|
||||
# Requires transformers > 4.32.0
|
||||
rope_theta = getattr(config, "rope_theta", 10000)
|
||||
self.self_attn = MixtralAttention(
|
||||
config=config,
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
max_position=config.max_position_embeddings,
|
||||
|
||||
@ -165,7 +165,6 @@ class MixtralAttention(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: MixtralConfig,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
@ -191,9 +190,7 @@ class MixtralAttention(nn.Module):
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
# MixtralConfig has an optional head_dim argument
|
||||
self.head_dim = getattr(config, "head_dim",
|
||||
self.hidden_size // self.total_num_heads)
|
||||
self.head_dim = hidden_size // self.total_num_heads
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -255,7 +252,6 @@ class MixtralDecoderLayer(nn.Module):
|
||||
# Requires transformers > 4.32.0
|
||||
rope_theta = getattr(config, "rope_theta", 10000)
|
||||
self.self_attn = MixtralAttention(
|
||||
config=config,
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
max_position=config.max_position_embeddings,
|
||||
|
||||
@ -73,7 +73,7 @@ class PixtralImagePixelInputs(TypedDict):
|
||||
"""
|
||||
A boolean mask indicating which image embeddings correspond
|
||||
to patch tokens.
|
||||
|
||||
|
||||
Shape: `(batch_size, num_images, num_embeds)`
|
||||
"""
|
||||
|
||||
@ -849,10 +849,10 @@ class VisionTransformer(nn.Module):
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
images: list of N_img images of variable sizes,
|
||||
images: list of N_img images of variable sizes,
|
||||
each of shape (C, H, W)
|
||||
Returns:
|
||||
image_features: tensor of token features for
|
||||
image_features: tensor of token features for
|
||||
all tokens of all images of shape (N_toks, D)
|
||||
"""
|
||||
# pass images through initial convolution independently
|
||||
@ -935,8 +935,7 @@ class PatchMerger(nn.Module):
|
||||
# x is (N, vision_encoder_dim)
|
||||
x = self.permute(x, image_sizes)
|
||||
|
||||
# x is (N / spatial_merge_size ** 2,
|
||||
# vision_encoder_dim * spatial_merge_size ** 2)
|
||||
# x is (N / spatial_merge_size ** 2, vision_encoder_dim * spatial_merge_size ** 2)
|
||||
x = self.merging_layer(x)
|
||||
|
||||
# x is (N / spatial_merge_size ** 2, vision_encoder_dim)
|
||||
|
||||
@ -40,7 +40,7 @@ class StructuredOutputManager:
|
||||
tokenizer_group.ping()
|
||||
|
||||
tokenizer = tokenizer_group.get_lora_tokenizer(None)
|
||||
self.vocab_size = self.vllm_config.model_config.get_vocab_size()
|
||||
self.vocab_size = len(tokenizer.get_vocab())
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
# NOTE: ideally, xgrammar should handle this accordingly.
|
||||
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
|
||||
|
||||
Reference in New Issue
Block a user