Compare commits

..

14 Commits

Author SHA1 Message Date
8a8b40d417 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-07-16 20:40:35 +00:00
c3f7afa6a8 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-07-16 20:39:45 +00:00
6cd8dec23f updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-07-16 20:29:24 +00:00
723263fa23 updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-07-15 22:06:34 +00:00
f29fd8a7f8 [BugFix] fix 3 issues: (1) using metadata for causal-conv1d, (2) indexing overflow in v1 vLLM, and (3) init_states in v0 (#20838)
Signed-off-by: Tuan M. Hoang-Trong <tmhoangt@us.ibm.com>
Co-authored-by: Tuan M. Hoang-Trong <tmhoangt@us.ibm.com>
2025-07-15 16:08:26 -04:00
ed10f3cea1 [ROCm] warpSize is being made non constexpr in ROCm 7.0 (#20330)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-15 14:01:44 -04:00
b637e9dcb8 Add full serve CLI reference back to docs (#20978)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 17:42:30 +00:00
1e36c8687e [Deprecation] Remove nullable_kvs (#20969)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 17:21:50 +00:00
5bac61362b Configure Gemini (#20971)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 09:37:05 -07:00
313ae8c16a [Deprecation] Remove everything scheduled for removal in v0.10.0 (#20979)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 15:57:53 +00:00
c847e34b39 [CI/Build] Fix wrong path in Transformers Nightly Models Test (#20994)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-15 08:53:16 -07:00
e7e3e6d263 Voxtral (#20970)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-15 07:35:30 -07:00
4ffd963fa0 [v1][core] Support for attention free models (#20811)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
2025-07-15 14:20:01 +00:00
56fe4bedd6 [Deprecation] Remove TokenizerPoolConfig (#20968)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-15 14:00:50 +00:00
44 changed files with 1072 additions and 413 deletions

View File

@ -645,7 +645,7 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s models/test_initialization.py
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py

6
.gemini/config.yaml Normal file
View File

@ -0,0 +1,6 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment

View File

@ -30,17 +30,11 @@ from datasets import load_dataset
from PIL import Image
from transformers import PreTrainedTokenizerBase
try:
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
except:
MultiModalDataDict = None
AnyTokenizer = None
LoRARequest = None
print("Install vLLM to use LoRA or Multimodal benchmarking.")
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
logger = logging.getLogger(__name__)

View File

@ -24,6 +24,7 @@
#include "attention_dtypes.h"
#include "attention_utils.cuh"
#include "cuda_compat.h"
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@ -33,12 +34,6 @@ typedef __hip_bfloat16 __nv_bfloat16;
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@ -670,7 +665,6 @@ __global__ void paged_attention_v2_reduce_kernel(
} // namespace vllm
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP

View File

@ -18,12 +18,7 @@
*/
#include "attention_kernels.cuh"
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#include "cuda_compat.h"
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -187,7 +182,6 @@ void paged_attention_v1(
CALL_V1_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP

View File

@ -18,12 +18,7 @@
*/
#include "attention_kernels.cuh"
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#include "cuda_compat.h"
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -197,7 +192,6 @@ void paged_attention_v2(
CALL_V2_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP

View File

@ -4,10 +4,10 @@
#include <hip/hip_runtime.h>
#endif
#ifndef USE_ROCM
#define WARP_SIZE 32
#if defined(USE_ROCM) && defined(__GFX9__)
#define WARP_SIZE 64
#else
#define WARP_SIZE warpSize
#define WARP_SIZE 32
#endif
#ifndef USE_ROCM

View File

@ -8,7 +8,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.ModelConfig][]
- [vllm.config.CacheConfig][]
- [vllm.config.TokenizerPoolConfig][]
- [vllm.config.LoadConfig][]
- [vllm.config.ParallelConfig][]
- [vllm.config.SchedulerConfig][]

View File

@ -1,3 +1,7 @@
---
toc_depth: 4
---
# vLLM CLI Guide
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -42,6 +46,10 @@ Start the vLLM OpenAI Compatible API server.
vllm serve --help=page
```
### Options
--8<-- "docs/argparse/serve.md"
## chat
Generate chat completions via the running API server.

View File

@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
## CLI Arguments
The `vllm serve` command is used to launch the OpenAI-compatible server.
To see the available CLI arguments, run `vllm serve --help`!
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
## Configuration file

View File

@ -103,9 +103,7 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m
vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.
By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.
Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.
However, when `tool_choice='none'` is specified, vLLM includes tool definitions in the prompt.
## Automatic Function Calling

View File

@ -16,6 +16,7 @@ sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock()
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
from vllm.utils import FlexibleArgumentParser # noqa: E402
logger = logging.getLogger("mkdocs")
@ -24,15 +25,18 @@ logger = logging.getLogger("mkdocs")
class MarkdownFormatter(HelpFormatter):
"""Custom formatter that generates markdown for argument groups."""
def __init__(self, prog):
def __init__(self, prog, starting_heading_level=3):
super().__init__(prog,
max_help_position=float('inf'),
width=float('inf'))
self._section_heading_prefix = "#" * starting_heading_level
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
self._markdown_output = []
def start_section(self, heading):
if heading not in {"positional arguments", "options"}:
self._markdown_output.append(f"\n### {heading}\n\n")
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
self._markdown_output.append(heading_md)
def end_section(self):
pass
@ -46,9 +50,13 @@ class MarkdownFormatter(HelpFormatter):
def add_arguments(self, actions):
for action in actions:
if (len(action.option_strings) == 0
or "--help" in action.option_strings):
continue
option_strings = f'`{"`, `".join(action.option_strings)}`'
self._markdown_output.append(f"#### {option_strings}\n\n")
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
self._markdown_output.append(heading_md)
if choices := action.choices:
choices = f'`{"`, `".join(str(c) for c in choices)}`'
@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
return cls.add_cli_args(parser, **kwargs)
def create_serve_parser() -> FlexibleArgumentParser:
    """Build the serve-command parser that renders its help as markdown.

    The parser's formatter class is replaced by a factory that creates a
    `MarkdownFormatter` with `starting_heading_level=4`; the serve CLI
    arguments are then registered through `make_arg_parser`.
    """

    def _markdown_formatter(prog):
        # Same call signature argparse uses for `formatter_class(prog)`.
        return MarkdownFormatter(prog, starting_heading_level=4)

    serve_parser = FlexibleArgumentParser()
    serve_parser.formatter_class = _markdown_formatter
    return make_arg_parser(serve_parser)
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
logger.info("Generating argparse documentation")
logger.debug("Root directory: %s", ROOT_DIR.resolve())
@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
"engine_args": create_parser(EngineArgs),
"async_engine_args": create_parser(AsyncEngineArgs,
async_args_only=True),
"serve": create_serve_parser(),
}
# Generate documentation for each parser

View File

@ -10,7 +10,7 @@ on HuggingFace model repository.
import os
from dataclasses import asdict
from typing import NamedTuple, Optional
from typing import Any, NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
@ -30,7 +30,9 @@ question_per_audio_count = {
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str
prompt: Optional[str] = None
prompt_token_ids: Optional[dict[str, list[int]]] = None
multi_modal_data: Optional[dict[str, Any]] = None
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.
# Voxtral
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
    """Prepare engine arguments and a pre-tokenized request for Voxtral.

    The first `audio_count` entries of `audio_assets` are loaded, wrapped in
    a single user message together with `question`, and encoded with the
    Mistral tokenizer. The result carries prompt token ids and the audio
    arrays (with their sampling rates) rather than a text prompt.
    """
    from mistral_common.audio import Audio
    from mistral_common.protocol.instruct.messages import (
        AudioChunk,
        RawAudio,
        TextChunk,
        UserMessage,
    )
    from mistral_common.protocol.instruct.request import ChatCompletionRequest
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

    model_name = "mistralai/Voxtral-Mini-3B-2507"
    tokenizer = MistralTokenizer.from_hf_hub(model_name)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
        config_format="mistral",
        load_format="mistral",
        tokenizer_mode="mistral",
        enforce_eager=True,
        enable_chunked_prefill=False,
    )

    # Load every requested asset first, then wrap each one as a chat chunk.
    loaded_audios = []
    for idx in range(audio_count):
        local_path = str(audio_assets[idx].get_local_path())
        loaded_audios.append(Audio.from_file(local_path, strict=False))
    audio_chunks = [
        AudioChunk(input_audio=RawAudio.from_audio(clip))
        for clip in loaded_audios
    ]

    # Audio chunks come first in the message content, the question last.
    user_message = UserMessage(
        content=[*audio_chunks, TextChunk(text=question)])
    request = ChatCompletionRequest(messages=[user_message],
                                    model=model_name)
    encoded = tokenizer.encode_chat_completion(request)

    audio_inputs = [(item.audio_array, item.sampling_rate)
                    for item in encoded.audios]

    return ModelRequestData(
        engine_args=engine_args,
        prompt_token_ids=encoded.tokens,
        multi_modal_data={"audio": audio_inputs},
    )
# Granite Speech
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# NOTE - the settings in this example are somewhat different than what is
@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map = {
"voxtral": run_voxtral,
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,
@ -311,16 +368,24 @@ def main(args):
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
)
mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}
mm_data = req_data.multi_modal_data
if not mm_data:
mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}
assert args.num_prompts > 0
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
inputs = {"multi_modal_data": mm_data}
if req_data.prompt:
inputs["prompt"] = req_data.prompt
else:
inputs["prompt_token_ids"] = req_data.prompt_token_ids
if args.num_prompts > 1:
# Batch inference
inputs = [inputs] * args.num_prompts

View File

@ -33,7 +33,7 @@ pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
importlib_metadata; python_version < '3.10'
mistral_common[opencv] >= 1.6.2
mistral_common[opencv] >= 1.8.0
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

View File

@ -17,6 +17,7 @@ cloudpickle
fastapi
msgspec
openai
partial-json-parser
pillow
psutil
pybase64

View File

@ -23,7 +23,7 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.6.2 # required for pixtral test
mistral_common[opencv] >= 1.8.0 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test

View File

@ -28,7 +28,7 @@ torchvision==0.22.0
transformers_stream_generator # required for qwen-vl test
mamba_ssm # required for plamo2 test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.7.0 # required for pixtral test
mistral_common[opencv] >= 1.8.0 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test

View File

@ -305,7 +305,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.7.0
mistral-common==1.8.0
# via -r requirements/test.in
more-itertools==10.5.0
# via lm-eval
@ -518,6 +518,8 @@ pyasn1-modules==0.4.2
# via google-auth
pybind11==2.13.6
# via lm-eval
pycountry==24.6.1
# via pydantic-extra-types
pycparser==2.22
# via cffi
pycryptodomex==3.22.0
@ -528,9 +530,12 @@ pydantic==2.11.5
# datamodel-code-generator
# mistral-common
# mteb
# pydantic-extra-types
# ray
pydantic-core==2.33.2
# via pydantic
pydantic-extra-types==2.10.5
# via mistral-common
pygments==2.18.0
# via rich
pyparsing==3.2.0
@ -835,6 +840,7 @@ typing-extensions==4.12.2
# pqdm
# pydantic
# pydantic-core
# pydantic-extra-types
# torch
# typer
# typing-inspection

View File

@ -692,7 +692,8 @@ setup(
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing
"audio": ["librosa", "soundfile",
"mistral_common[audio]"], # Required for audio processing
"video": [] # Kept for backwards compatibility
},
cmdclass=cmdclass,

View File

@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
def api_server(distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
@ -40,8 +40,6 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
"facebook/opt-125m",
"--host",
"127.0.0.1",
"--tokenizer-pool-size",
str(tokenizer_pool_size),
"--distributed-executor-backend",
distributed_executor_backend,
]
@ -54,10 +52,8 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
uvicorn_process.terminate()
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, tokenizer_pool_size: int,
distributed_executor_backend: str):
def test_api_server(api_server, distributed_executor_backend: str):
"""
Run the API server and test it.

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from argparse import ArgumentError, ArgumentTypeError
from argparse import ArgumentError
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Annotated, Literal, Optional
@ -12,8 +12,8 @@ import pytest
from vllm.config import CompilationConfig, config
from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
get_type, get_type_hints, is_not_builtin,
is_type, literal_to_kwargs, nullable_kvs,
optional_type, parse_type)
is_type, literal_to_kwargs, optional_type,
parse_type)
from vllm.utils import FlexibleArgumentParser
@ -25,18 +25,10 @@ from vllm.utils import FlexibleArgumentParser
"foo": 1,
"bar": 2
}),
(json.loads, "foo=1,bar=2", {
"foo": 1,
"bar": 2
}),
])
def test_parse_type(type, value, expected):
parse_type_func = parse_type(type)
context = nullcontext()
if value == "foo=1,bar=2":
context = pytest.warns(DeprecationWarning)
with context:
assert parse_type_func(value) == expected
assert parse_type_func(value) == expected
def test_optional_type():
@ -203,34 +195,6 @@ def test_get_kwargs():
assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4
@pytest.mark.parametrize(
    ("arg", "expected"),
    [
        (None, {}),
        ("image=16", {"image": 16}),
        ("image=16,video=2", {"image": 16, "video": 2}),
        ("Image=16, Video=2", {"image": 16, "video": 2}),
    ],
)
def test_limit_mm_per_prompt_parser(arg, expected):
    """Exercise the KEY=VALUE form of `--limit-mm-per-prompt`.

    This functionality is deprecated and will be removed in the future;
    the argument should be passed as a JSON string instead.

    TODO: Remove with nullable_kvs."""
    # No flag at all when `arg` is None, otherwise the flag plus its value.
    cli_args = [] if arg is None else ["--limit-mm-per-prompt", arg]
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    parsed = parser.parse_args(cli_args)

    assert parsed.limit_mm_per_prompt == expected
@pytest.mark.parametrize(
("arg", "expected"),
[
@ -326,18 +290,6 @@ def test_prefix_cache_default():
assert not engine_args.enable_prefix_caching
@pytest.mark.parametrize(
    "arg",
    [
        # Missing =
        "image",
        # Conflicting values
        "image=4,image=5",
        # Too many = in tokenized arg
        "image=video=4",
    ],
)
def test_bad_nullable_kvs(arg):
    """Each malformed input must make `nullable_kvs` raise ArgumentTypeError."""
    with pytest.raises(ArgumentTypeError):
        nullable_kvs(arg)
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
(None, None, "mm-processor-kwargs"),

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Final
import pytest
@ -29,7 +30,7 @@ def server():
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
json.dumps({"image": MAXIMUM_IMAGES}),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

View File

@ -17,6 +17,11 @@ from vllm.assets.audio import AudioAsset
from ...utils import RemoteOpenAIServer
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
]
@pytest.fixture
def mary_had_lamb():
@ -33,9 +38,18 @@ def winning_call():
@pytest.mark.asyncio
async def test_basic_audio(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo"
@pytest.mark.parametrize(
"model_name",
["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
server_args = ["--enforce-eager"]
if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS
# TODO(PATRICK) - REMOVE AFTER RELEASE
return # skip for now
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
@ -65,10 +79,13 @@ async def test_bad_requests(mary_had_lamb):
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo"
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
async def test_long_audio_request(mary_had_lamb, model_name):
server_args = ["--enforce-eager"]
if model_name.startswith("openai"):
return
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
@ -87,7 +104,8 @@ async def test_long_audio_request(mary_had_lamb):
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert out.count("Mary had a little lamb") == 10
counts = out.count("Mary had a little lamb")
assert counts == 10, counts
@pytest.mark.asyncio

View File

@ -440,6 +440,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
trust_remote_code=True), # noqa: E501
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", is_available_online=False, tokenizer_mode="mistral"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
# [Cross-encoder]
@ -513,4 +514,4 @@ class HfExampleModels:
raise ValueError(f"No example model defined for {model_id}")
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)

View File

@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self, deprecated, runtime_checkable
from typing_extensions import Self, runtime_checkable
import vllm.envs as envs
from vllm import version
@ -1730,35 +1730,6 @@ class CacheConfig:
logger.warning("Possibly too large swap space. %s", msg)
@config
@dataclass
class TokenizerPoolConfig:
    """This config is deprecated and will be removed in a future release.
    Passing these parameters will have no effect. Please remove them from your
    configurations.
    """

    # All three fields below are kept only so existing configurations still
    # load; their docstrings state that setting them has no effect.
    pool_size: int = 0
    """This parameter is deprecated and will be removed in a future release.
    Passing this parameter will have no effect. Please remove it from your
    configurations."""
    pool_type: str = "ray"
    """This parameter is deprecated and will be removed in a future release.
    Passing this parameter will have no effect. Please remove it from your
    configurations."""
    extra_config: dict = field(default_factory=dict)
    """This parameter is deprecated and will be removed in a future release.
    Passing this parameter will have no effect. Please remove it from your
    configurations."""

    def __post_init__(self) -> None:
        # Runs after dataclass initialisation and reports the deprecation
        # through the logger's `warning_once` helper.
        logger.warning_once(
            "TokenizerPoolConfig is deprecated and will be removed in a "
            "future release. Passing this parameter will have no effect. "
            "Please remove it from your configurations.")
class LoadFormat(str, enum.Enum):
AUTO = "auto"
PT = "pt"
@ -1922,10 +1893,6 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
"""This parameter is deprecated and will be removed in a future release.
Please remove it from your configs"""
ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
@ -3692,18 +3659,6 @@ GuidedDecodingBackend = Literal[GuidedDecodingBackendV0,
class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine."""
@property
@deprecated(
    "`guided_decoding_backend` is deprecated and has been renamed to "
    "`backend`. This will be removed in v0.10.0. Please use the "
    "`backend` argument instead.")
def guided_decoding_backend(self) -> GuidedDecodingBackend:
    # Deprecated read alias: hands back whatever `backend` currently holds.
    return self.backend

@guided_decoding_backend.setter
def guided_decoding_backend(self, value: GuidedDecodingBackend):
    # Deprecated write alias: the value is stored in `backend`.
    self.backend = value
backend: GuidedDecodingBackend = "auto" if envs.VLLM_USE_V1 else "xgrammar"
"""Which engine will be used for guided decoding (JSON schema / regex etc)
by default. With "auto", we will make opinionated choices based on request
@ -3746,9 +3701,6 @@ class DecodingConfig:
return hash_str
def __post_init__(self):
if ":" in self.backend:
self._extract_backend_options()
if envs.VLLM_USE_V1:
valid_guided_backends = get_args(GuidedDecodingBackendV1)
else:
@ -3764,24 +3716,6 @@ class DecodingConfig:
raise ValueError("disable_additional_properties is only supported "
"for the guidance backend.")
@deprecated(
    "Passing guided decoding backend options inside backend in the format "
    "'backend:...' is deprecated. This will be removed in v0.10.0. Please "
    "use the dedicated arguments '--disable-fallback', "
    "'--disable-any-whitespace' and '--disable-additional-properties' "
    "instead.")
def _extract_backend_options(self):
    """Split a legacy 'backend:opt1,opt2' value into backend name and flags.

    `self.backend` is replaced by the part before the colon, and each
    recognised option after the colon switches the matching `disable_*`
    attribute to True. Unrecognised options are ignored.
    """
    name, raw_options = self.backend.split(":")
    self.backend = cast(GuidedDecodingBackend, name)
    requested = {*raw_options.strip().split(",")}

    # Legacy option token -> attribute that it turns on.
    option_to_attr = (
        ("no-fallback", "disable_fallback"),
        ("disable-any-whitespace", "disable_any_whitespace"),
        ("no-additional-properties", "disable_additional_properties"),
    )
    for token, attr_name in option_to_attr:
        if token in requested:
            setattr(self, attr_name, True)
DetailedTraceModules = Literal["model", "worker", "all"]

View File

@ -10,6 +10,7 @@ from collections import defaultdict
from collections.abc import Iterator
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass
from importlib import metadata
from typing import TYPE_CHECKING, Any, Optional
import msgspec
@ -42,16 +43,19 @@ EngineId = str
ReqId = str
GET_META_MSG = b"get_meta_msg"
import os
VLLM_DEBUG_NIXL_XFER_TIME = os.getenv("VLLM_DEBUG_NIXL_XFER_TIME", "1") == "1"
logger = init_logger(__name__)
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
try:
from nixl._api import nixl_agent as NixlWrapper
logger.info("NIXL is available")
from nixl._api import nixl_agent as NixlWrapper, nixl_agent_config
NIXL_VERSION = metadata.version("nixl")
except ImportError:
logger.warning("NIXL is not available")
NixlWrapper = None
NIXL_VERSION = None
class NixlAgentMetadata(
msgspec.Struct,
@ -352,16 +356,20 @@ class NixlConnectorWorker:
def __init__(self, vllm_config: VllmConfig, engine_id: str):
if NixlWrapper is None:
logger.error("NIXL is not available")
raise RuntimeError("NIXL is not available")
logger.info("Initializing NIXL wrapper")
logger.info("Initializing NIXL worker %s", engine_id)
raise RuntimeError("NIXL is not available.")
logger.info("Initializing NIXL v%s: worker %s", NIXL_VERSION, engine_id)
# Config.
self.vllm_config = vllm_config
self.block_size = vllm_config.cache_config.block_size
# Agent.
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
import os
NIXL_NUM_WORKERS = int(os.getenv("VLLM_NIXL_NUM_WORKERS", "8"))
logger.info(f"Using NIXL_NUM_WORKERS={NIXL_NUM_WORKERS} for NIXL agent.")
config = nixl_agent_config(enable_prog_thread=False, num_threads=NIXL_NUM_WORKERS)
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
# Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
@ -449,7 +457,8 @@ class NixlConnectorWorker:
def __del__(self):
"""Cleanup background threads on destruction."""
self._handshake_initiation_executor.shutdown(wait=False)
if t_ := getattr(self, "_handshake_initiation_executor", None):
t_.shutdown(wait=False)
if self._nixl_handshake_listener_t:
self._nixl_handshake_listener_t.join(timeout=0)
@ -1019,10 +1028,16 @@ class NixlConnectorWorker:
remote_xfer_side_handle,
remote_block_descs_ids,
notif_msg=notif_id,
skip_desc_merge=True,
)
# Begin async xfer.
start = time.perf_counter()
self.nixl_wrapper.transfer(handle)
end = time.perf_counter()
if VLLM_DEBUG_NIXL_XFER_TIME:
# Log the time taken for the transfer.
logger.info(f"TIME: {end - start}")
# Use handle to check completion in future step().
# TODO (NickLucche) surface xfer elapsed time

View File

@ -9,7 +9,6 @@ import functools
import json
import sys
import threading
import warnings
from dataclasses import MISSING, dataclass, fields, is_dataclass
from itertools import permutations
from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
@ -19,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
import regex as re
import torch
from pydantic import TypeAdapter, ValidationError
from typing_extensions import TypeIs, deprecated
from typing_extensions import TypeIs
import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
@ -32,8 +31,8 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, PromptAdapterConfig,
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
TaskOption, TokenizerMode, TokenizerPoolConfig,
VllmConfig, get_attr_docs, get_field)
TaskOption, TokenizerMode, VllmConfig, get_attr_docs,
get_field)
from vllm.logger import init_logger
from vllm.platforms import CpuArchEnum, current_platform
from vllm.plugins import load_general_plugins
@ -66,9 +65,6 @@ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
def _parse_type(val: str) -> T:
try:
if return_type is json.loads and not re.match(
r"(?s)^\s*{.*}\s*$", val):
return cast(T, nullable_kvs(val))
return return_type(val)
except ValueError as e:
raise argparse.ArgumentTypeError(
@ -94,42 +90,6 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
return optional_type(json.loads)(val)
@deprecated(
    "Passing a JSON argument as a string containing comma separated key=value "
    "pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
    "string instead.")
def nullable_kvs(val: str) -> dict[str, int]:
    """Turn a comma separated `KEY=VALUE` string into a `dict[str, int]`.

    Every key and value is lower-cased and stripped of surrounding
    whitespace before the value is converted with `int`.

    Args:
        val: String value to be parsed, e.g. `"image=16,video=2"`.

    Returns:
        Dictionary mapping each key to its integer value.

    Raises:
        argparse.ArgumentTypeError: If an item is not of the form
            KEY=VALUE, a value is not an integer, or the same key is given
            two different values.
    """
    parsed: dict[str, int] = {}
    for pair in val.split(","):
        pieces = [piece.lower().strip() for piece in pair.split("=")]
        if len(pieces) != 2:
            raise argparse.ArgumentTypeError(
                "Each item should be in the form KEY=VALUE")
        name, raw = pieces

        try:
            number = int(raw)
        except ValueError as exc:
            msg = f"Failed to parse value of item {name}={raw}"
            raise argparse.ArgumentTypeError(msg) from exc

        # Repeating a key is fine only when the value is unchanged.
        if parsed.get(name, number) != number:
            raise argparse.ArgumentTypeError(
                f"Conflicting values specified for key: {name}")
        parsed[name] = number

    return parsed
def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
    """Tell whether `type_hint` is `type` itself or has `type` as its origin."""
    if type_hint is type:
        return True
    # Otherwise compare against the hint's origin as given by `get_origin`.
    return get_origin(type_hint) is type
@ -373,13 +333,6 @@ class EngineArgs:
enforce_eager: bool = ModelConfig.enforce_eager
max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
# The following three fields are deprecated and will be removed in a future
# release. Setting them will have no effect. Please remove them from your
# configurations.
tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
tokenizer_pool_extra_config: dict = \
get_field(TokenizerPoolConfig, "extra_config")
limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt")
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
@ -441,7 +394,6 @@ class EngineArgs:
speculative_config: Optional[Dict[str, Any]] = None
qlora_adapter_name_or_path: Optional[str] = None
show_hidden_metrics_for_version: Optional[str] = \
ObservabilityConfig.show_hidden_metrics_for_version
otlp_traces_endpoint: Optional[str] = \
@ -475,7 +427,6 @@ class EngineArgs:
additional_config: dict[str, Any] = \
get_field(VllmConfig, "additional_config")
enable_reasoning: Optional[bool] = None # DEPRECATED
reasoning_parser: str = DecodingConfig.reasoning_backend
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
@ -493,13 +444,6 @@ class EngineArgs:
if isinstance(self.compilation_config, (int, dict)):
self.compilation_config = CompilationConfig.from_cli(
str(self.compilation_config))
if self.qlora_adapter_name_or_path is not None:
warnings.warn(
"The `qlora_adapter_name_or_path` is deprecated "
"and will be removed in v0.10.0. ",
DeprecationWarning,
stacklevel=2,
)
# Setup plugins
from vllm.plugins import load_general_plugins
load_general_plugins()
@ -612,14 +556,6 @@ class EngineArgs:
**load_kwargs["ignore_patterns"])
load_group.add_argument("--use-tqdm-on-load",
**load_kwargs["use_tqdm_on_load"])
load_group.add_argument(
"--qlora-adapter-name-or-path",
type=str,
default=None,
help="The `--qlora-adapter-name-or-path` has no effect, do not set"
" it, and it will be removed in v0.10.0.",
deprecated=True,
)
load_group.add_argument('--pt-load-map-location',
**load_kwargs["pt_load_map_location"])
@ -640,15 +576,6 @@ class EngineArgs:
guided_decoding_group.add_argument(
"--guided-decoding-disable-additional-properties",
**guided_decoding_kwargs["disable_additional_properties"])
guided_decoding_group.add_argument(
"--enable-reasoning",
action=argparse.BooleanOptionalAction,
deprecated=True,
help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
"of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
"parser backend instead. This flag (`--enable-reasoning`) will be "
"removed in v0.10.0. When `--reasoning-parser` is specified, "
"reasoning mode is automatically enabled.")
guided_decoding_group.add_argument(
"--reasoning-parser",
# This choices is a special case because it's not static
@ -751,19 +678,6 @@ class EngineArgs:
cache_group.add_argument("--calculate-kv-scales",
**cache_kwargs["calculate_kv_scales"])
# Tokenizer arguments
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
tokenizer_group = parser.add_argument_group(
title="TokenizerPoolConfig",
description=TokenizerPoolConfig.__doc__,
)
tokenizer_group.add_argument("--tokenizer-pool-size",
**tokenizer_kwargs["pool_size"])
tokenizer_group.add_argument("--tokenizer-pool-type",
**tokenizer_kwargs["pool_type"])
tokenizer_group.add_argument("--tokenizer-pool-extra-config",
**tokenizer_kwargs["extra_config"])
# Multimodal related configs
multimodal_kwargs = get_kwargs(MultiModalConfig)
multimodal_group = parser.add_argument_group(

View File

@ -67,37 +67,6 @@ class ServeSubcommand(CLISubcommand):
help="Start the vLLM OpenAI Compatible API server.",
description="Start the vLLM OpenAI Compatible API server.",
usage="vllm serve [model_tag] [options]")
serve_parser.add_argument("model_tag",
type=str,
nargs='?',
help="The model tag to serve "
"(optional if specified in config)")
serve_parser.add_argument(
"--headless",
action='store_true',
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
serve_parser.add_argument(
'--data-parallel-start-rank',
'-dpr',
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
serve_parser.add_argument('--api-server-count',
'-asc',
type=int,
default=1,
help='How many API server processes to run.')
serve_parser.add_argument(
"--config",
type=str,
default='',
required=False,
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
serve_parser = make_arg_parser(serve_parser)
show_filtered_argument_or_group_from_help(serve_parser, ["serve"])

View File

@ -1514,8 +1514,6 @@ async def init_app_state(
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
expand_tools_even_if_tool_choice_none=args.
expand_tools_even_if_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
@ -1531,8 +1529,6 @@ async def init_app_state(
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
expand_tools_even_if_tool_choice_none=args.
expand_tools_even_if_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,

View File

@ -182,13 +182,6 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
expand_tools_even_if_tool_choice_none: bool = False
"""Include tool definitions in prompts even when `tool_choice='none'`.
This is a transitional option that will be removed in v0.10.0. In
v0.10.0, tool definitions will always be included regardless of
`tool_choice` setting. Use this flag to test the upcoming behavior
before the breaking change."""
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@ -225,11 +218,6 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
valid_tool_parsers = list(ToolParserManager.tool_parsers.keys())
frontend_kwargs["tool_call_parser"]["choices"] = valid_tool_parsers
# Special case for expand-tools-even-if-tool-choice-none because of
# the deprecation field
frontend_kwargs["expand_tools_even_if_tool_choice_none"]\
["deprecated"] = True
frontend_group = parser.add_argument_group(
title="Frontend",
description=FrontendArgs.__doc__,
@ -248,6 +236,34 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
register all arguments instead of manually enumerating them here. This
avoids code duplication and keeps the argument definitions in one place.
"""
parser.add_argument("model_tag",
type=str,
nargs="?",
help="The model tag to serve "
"(optional if specified in config)")
parser.add_argument(
"--headless",
action="store_true",
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
parser.add_argument(
"--data-parallel-start-rank",
"-dpr",
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
parser.add_argument("--api-server-count",
"-asc",
type=int,
default=1,
help="How many API server processes to run.")
parser.add_argument(
"--config",
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
parser = FrontendArgs.add_cli_args(parser)
parser = AsyncEngineArgs.add_cli_args(parser)

View File

@ -63,7 +63,6 @@ class OpenAIServingChat(OpenAIServing):
return_tokens_as_token_ids: bool = False,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
expand_tools_even_if_tool_choice_none: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,
@ -112,8 +111,6 @@ class OpenAIServingChat(OpenAIServing):
raise TypeError("Error: --enable-auto-tool-choice requires "
f"tool_parser:'{tool_parser}' which has not "
"been registered") from e
self.expand_tools_even_if_tool_choice_none = (
expand_tools_even_if_tool_choice_none)
self.enable_prompt_tokens_details = enable_prompt_tokens_details
self.enable_force_include_usage = enable_force_include_usage
@ -182,20 +179,6 @@ class OpenAIServingChat(OpenAIServing):
if request.tools is None:
tool_dicts = None
elif (request.tool_choice == "none"
and not self.expand_tools_even_if_tool_choice_none):
if len(request.tools) > 0:
logger.warning_once(
"Tools are specified but tool_choice is set to 'none' "
"and --expand-tools-even-if-tool-choice-none is not "
"enabled. Tool definitions will be excluded from the "
"prompt. This behavior will change in vLLM v0.10 where "
"tool definitions will be included by default even "
"with tool_choice='none'. To adopt the new behavior "
"now, use --expand-tools-even-if-tool-choice-none. "
"To suppress this warning, either remove tools from "
"the request or set tool_choice to a different value.")
tool_dicts = None
else:
tool_dicts = [tool.model_dump() for tool in request.tools]

View File

@ -51,7 +51,6 @@ class OpenAIServingResponses(OpenAIServing):
return_tokens_as_token_ids: bool = False,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
expand_tools_even_if_tool_choice_none: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,

View File

@ -112,6 +112,7 @@ class OpenAISpeechToText(OpenAIServing):
prompt = self.model_cls.get_generation_prompt(
audio=chunk,
stt_config=self.asr_config,
model_config=self.model_config,
language=lang,
task_type=self.task_type,
request_prompt=request.prompt)

View File

@ -573,8 +573,8 @@ class MambaMixer2(MambaBase, CustomOp):
x = hidden_states_B_C_p.transpose(
0, 1) # this is the form that causal-conv see
if mamba2_metadata.cu_seqlen is None:
mamba2_metadata = update_metadata(
x, attn_metadata.query_start_loc, mamba2_metadata)
mamba2_metadata = update_metadata(x, query_start_loc_p,
mamba2_metadata)
hidden_states_B_C_p = causal_conv1d_fn(
x,
conv_weights,
@ -583,6 +583,7 @@ class MambaMixer2(MambaBase, CustomOp):
conv_states=conv_state,
has_initial_state=has_initial_states_p,
cache_indices=state_indices_tensor_p,
metadata=mamba2_metadata,
query_start_loc=query_start_loc_p).transpose(
0, 1)[:num_prefill_tokens]
@ -593,9 +594,14 @@ class MambaMixer2(MambaBase, CustomOp):
initial_states = None
if (has_initial_states_p is not None and prep_initial_states):
# making a copy of the states
initial_states = torch.where(
has_initial_states_p[:, None, None, None],
ssm_state[state_indices_tensor_p], 0)
if envs.VLLM_USE_V1:
initial_states = torch.where(
has_initial_states_p[:, None, None, None],
ssm_state[state_indices_tensor_p], 0)
else:
initial_states = torch.where(
has_initial_states_p[:num_prefills, None, None, None],
ssm_state[state_indices_tensor_p], 0)
scan_output, varlen_state = mamba_chunk_scan_combined(
hidden_states_p.view(1, num_prefill_tokens,

View File

@ -55,7 +55,6 @@ def _causal_conv1d_fwd_kernel( # continuous batching
IS_CONTINUOUS_BATCHING: tl.constexpr,
USE_PAD_SLOT: tl.constexpr,
NP2_STATELEN: tl.constexpr,
DECODE_SEQLEN: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
@ -416,7 +415,7 @@ def causal_conv1d_fn(
activation = "silu"
args = None
out = torch.zeros_like(x)
out = torch.empty_like(x)
if metadata is not None:
cu_seqlen = metadata.cu_seqlen
nums_dict = metadata.nums_dict
@ -607,7 +606,6 @@ def causal_conv1d_fn(
IS_CONTINUOUS_BATCHING=cache_indices is not None,
USE_PAD_SLOT=pad_slot_id is not None,
NP2_STATELEN=np2_statelen,
DECODE_SEQLEN=1,
#launch_cooperative_grid=True
BLOCK_M=8,
BLOCK_N=256,
@ -665,7 +663,8 @@ def _causal_conv1d_update_kernel(
if IS_CONTINUOUS_BATCHING:
# mask = idx_seq < batch
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq)
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(
tl.int64)
else:
conv_state_batch_coord = idx_seq
if USE_PAD_SLOT: # noqa

View File

@ -722,7 +722,8 @@ class SupportsTranscription(Protocol):
@classmethod
def get_generation_prompt(cls, audio: np.ndarray,
stt_config: SpeechToTextConfig, language: str,
stt_config: SpeechToTextConfig,
model_config: ModelConfig, language: str,
task_type: str,
request_prompt: str) -> PromptType:
"""Get the prompt for the ASR model.

View File

@ -231,6 +231,7 @@ _MULTIMODAL_MODELS = {
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
# [Encoder-decoder]
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501

View File

@ -0,0 +1,691 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from math import ceil
from typing import Optional, Union, cast
import numpy as np
import regex as re
import torch
import torch.nn as nn
from mistral_common.audio import mel_filter_bank
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
TextChunk, UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.protocol.transcription.request import TranscriptionRequest
from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
from transformers import TensorType, WhisperConfig
from transformers.tokenization_utils_base import TextInput
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import SupportsPP
# yapf: disable
from vllm.model_executor.models.whisper import (
WhisperEncoder, WhisperForConditionalGeneration)
# yapf: enable
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, MultiModalHashes,
PromptReplacement, PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import (MistralTokenizer,
cached_tokenizer_from_config)
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
SupportsTranscription)
from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
merge_multimodal_embeddings)
logger = init_logger(__name__)
class VoxtralProcessorAdapter:
    """
    Provide a HF-compatible interface for
    :class:`mistral_common.tokens.tokenizers.audio.AudioEncoder`.

    The adapter reads the audio encoder from the wrapped tokenizer and
    exposes its special token ids and audio settings as cached attributes.
    """

    def __init__(self, tokenizer: MistralTokenizer) -> None:
        super().__init__()
        self.tokenizer = tokenizer

    @cached_property
    def _audio_processor(self) -> AudioEncoder:
        encoder = self.tokenizer.instruct.audio_encoder
        assert isinstance(encoder, AudioEncoder)
        return encoder

    @cached_property
    def audio_token_id(self) -> int:
        return self._audio_processor.special_ids.audio

    @cached_property
    def begin_audio_token_id(self) -> int:
        return self._audio_processor.special_ids.begin_audio

    # NOTE: the begin/end transcript token ids (`special_ids.begin_transcript`
    # and `special_ids.end_transcript`) are intentionally not exposed here.

    @cached_property
    def sampling_rate(self) -> int:
        return self._audio_processor.audio_config.sampling_rate

    @cached_property
    def frame_rate(self) -> float:
        return self._audio_processor.audio_config.frame_rate

    def get_num_audio_tokens(
        self,
        audio_length: int,
    ) -> int:
        """Return the number of audio tokens for `audio_length` samples.

        The length is first rounded up by the audio encoder's
        `next_multiple_of_chunk_frames`, then divided by the number of
        samples per frame (`sampling_rate // frame_rate`), rounding up.
        """
        padded_length = self._audio_processor.next_multiple_of_chunk_frames(
            audio_length, self.sampling_rate)
        samples_per_token = self.sampling_rate // self.frame_rate
        return ceil(padded_length / samples_per_token)

    def __call__(
        self,
        text: Optional[Union[TextInput, list[TextInput]]] = None,
        audios: Optional[Union[np.ndarray, list[np.ndarray]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> Mapping[str, NestedTensors]:
        """Build placeholder token ids and padded waveforms for `audios`.

        Without audio, `text` is simply tokenized. With audio, only empty
        text entries are accepted; the result holds one begin-audio token
        followed by the audio tokens for every clip (`input_ids`, repeated
        once per text entry) and the padded waveforms (`audio_arrays`).

        Raises:
            ValueError: If audio is given together with non-empty text.
        """
        if text is None:
            texts = []
        elif isinstance(text, list):
            texts = text
        else:
            texts = [text]

        if audios is None:
            audio_list = []
        elif isinstance(audios, list):
            audio_list = audios
        else:
            audio_list = [audios]

        if not audio_list:
            token_ids = self.tokenizer(texts).input_ids
            return {"input_ids": torch.tensor(token_ids)}

        # Allow dummy text, which is used for profiling as well as token inputs
        if any(len(t) > 0 for t in texts):
            raise ValueError(
                "You've passed text inputs instead of token inputs. "
                "Make sure to process your input via `mistral_common`'s "
                "tokenizer or pass a chat completion request. "
                "For more info, see: "
                "https://github.com/vllm-project/vllm/issues/8411.")

        token_tensors: list[torch.Tensor] = []
        waveform_tensors: list[torch.Tensor] = []
        for waveform in audio_list:
            assert isinstance(waveform, np.ndarray)
            assert waveform.ndim == 1

            # pad if necessary
            padded = self._audio_processor.pad(waveform, self.sampling_rate)

            placeholder_ids = [self.begin_audio_token_id]
            placeholder_ids += [self.audio_token_id
                                ] * self.get_num_audio_tokens(len(padded))

            token_tensors.append(torch.tensor(placeholder_ids))
            waveform_tensors.append(torch.tensor(padded))

        all_ids = torch.cat(token_tensors)
        return {
            "input_ids": all_ids[None].expand(len(texts), -1),
            "audio_arrays": waveform_tensors,
        }
class VoxtralProcessingInfo(BaseProcessingInfo):
    """Processing information for Voxtral: tokenizer, processor and limits."""

    def get_tokenizer(self) -> MistralTokenizer:
        """Return the cached tokenizer for this model.

        Raises:
            ValueError: If the tokenizer is not a `MistralTokenizer`.
        """
        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
        if not isinstance(tokenizer, MistralTokenizer):
            raise ValueError("This model requires `--tokenizer-mode mistral`")

        return tokenizer

    def get_hf_processor(self, **kwargs: object) -> VoxtralProcessorAdapter:
        """Return a processor adapter wrapping the Mistral tokenizer.

        `VoxtralMultiModalProcessor._get_prompt_updates` calls this method
        with `**hf_processor_mm_kwargs`, so keyword arguments must be
        accepted. The adapter has no configurable options, so they are
        ignored rather than raising `TypeError`.
        """
        return VoxtralProcessorAdapter(self.get_tokenizer())

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": 5}  # Performance tends to degrade after 5

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        return {"audio": self.get_max_audio_tokens()}

    def get_max_audio_tokens(self) -> int:
        """Return the audio token budget, taken to be `max_model_len`."""
        return self.ctx.model_config.max_model_len

    def get_max_audio_array_len(self) -> int:
        """Return the waveform length (in samples) matching the token budget.

        Computed as the maximum number of audio tokens times the number of
        samples per frame (`sampling_rate // frame_rate`).
        """
        processor = self.get_hf_processor()
        samples_per_token = int(processor.sampling_rate //
                                processor.frame_rate)
        return self.get_max_audio_tokens() * samples_per_token
class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
    """Build dummy audio inputs for Voxtral."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        return ""

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        """Return `mm_counts["audio"]` dummy clips of the maximum length."""
        num_audios = mm_counts.get("audio", 0)
        target_length = self.info.get_max_audio_array_len()

        return {
            "audio":
            self._get_dummy_audios(length=target_length, num_audios=num_audios)
        }

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """Tokenize a chat request made of the dummy text and dummy audio.

        Returns:
            Processor inputs whose prompt is the token list produced by the
            Mistral tokenizer and whose audio data are the arrays returned
            by that tokenizer.
        """
        tokenizer = self.info.get_tokenizer()

        dummy_text = self.get_dummy_text(mm_counts)
        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
        dummy_audios = dummy_mm_data.get("audio", [])

        # The sampling rate is the same for every clip, so look it up once
        # instead of building a new processor adapter per audio.
        sampling_rate = self.info.get_hf_processor().sampling_rate
        audio_format = "wav"

        audio_chunks: list[AudioChunk] = []
        for audio in dummy_audios:
            audio_item = Audio(
                audio_array=audio,
                sampling_rate=sampling_rate,
                format=audio_format,
            )
            audio_chunks.append(
                AudioChunk(input_audio=RawAudio.from_audio(audio_item)))

        request = ChatCompletionRequest(messages=[
            UserMessage(content=[TextChunk(text=dummy_text), *audio_chunks]),
        ])
        res = tokenizer.mistral.encode_chat_completion(request)
        dummy_tokens = res.tokens
        # The Voxtral tokenizer adds padding to the audio, so the audio
        # arrays must be replaced with the ones it returns.
        dummy_mm_data["audio"] = [a.audio_array for a in res.audios]

        return ProcessorInputs(prompt=dummy_tokens, mm_data=dummy_mm_data)
class VoxtralMultiModalProcessor(
        BaseMultiModalProcessor[VoxtralProcessingInfo]):
    """Multimodal processor for Voxtral.

    Prompts are expected to already contain the audio placeholder tokens
    (see the note in `_cached_apply_hf_processor`).
    """

    def _get_mm_fields_config(
        self,
        hf_inputs: Mapping[str, NestedTensors],
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return {"audio_arrays": MultiModalFieldConfig.batched("audio")}

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        """Describe the audio placeholder tokens for each audio item."""
        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        audio_id = processor.audio_token_id

        def get_replacement(item_idx: int):
            # One audio token per frame of the (padded) clip.
            audio_items = mm_items.get_items("audio", AudioProcessorItems)
            num_samples = audio_items.get_audio_length(item_idx)
            return [audio_id] * processor.get_num_audio_tokens(num_samples)

        audio_update = PromptReplacement(
            modality="audio",
            target="",  # Never match the prompt (see below note)
            replacement=get_replacement,
        )
        return [audio_update]

    def _cached_apply_hf_processor(
        self,
        prompt: Union[str, list[int]],
        mm_data_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
        *,
        return_mm_hashes: bool,
    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
        """Delegate to the parent and force the last tuple element to True.

        The parent's fourth return value is discarded.
        NOTE(review): presumably that value flags whether prompt updates
        were already applied -- confirm against the base class.
        """
        token_ids, processed_kwargs, hashes, _ = (
            super()._cached_apply_hf_processor(
                prompt=prompt,
                mm_data_items=mm_data_items,
                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                tokenization_kwargs=tokenization_kwargs,
                return_mm_hashes=return_mm_hashes,
            ))

        # NOTE: The tokens are already inserted by the chat template
        return token_ids, processed_kwargs, hashes, True

    def _get_data_parser(self) -> MultiModalDataParser:
        """Return a data parser targeting the processor's sampling rate."""
        target_rate = self.info.get_hf_processor().sampling_rate
        return MultiModalDataParser(target_sr=target_rate)
@MULTIMODAL_REGISTRY.register_processor(VoxtralMultiModalProcessor,
                                        info=VoxtralProcessingInfo,
                                        dummy_inputs=VoxtralDummyInputsBuilder)
class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
                                      SupportsPP, SupportsTranscription):
    """Voxtral: a Whisper-style audio encoder feeding a language model.

    Audio waveforms are encoded by `whisper_encoder`, groups of
    `downsample_factor` consecutive frames are stacked along the feature
    axis, projected by `audio_language_adapter`, and merged into the
    language model's input embeddings at the audio token positions.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)

        hf_config = vllm_config.model_config.hf_config
        self.config = hf_config
        self.downsample_factor = self.config.audio_config.downsample_factor

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=hf_config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
        )
        self.whisper_encoder = VoxtralEncoderModel(
            vllm_config.with_hf_config(hf_config.audio_config),
            prefix=maybe_prefix(prefix, "whisper_encoder"),
        )
        # The adapter consumes `downsample_factor` stacked encoder frames.
        self.audio_language_adapter = AudioLanguageAdapter(
            hidden_size=hf_config.audio_config.d_model *
            self.downsample_factor,
            dim=hf_config.text_config.hidden_size,
        )

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the language model, embedding audio first when needed."""
        if intermediate_tensors is not None:
            inputs_embeds = None
        elif inputs_embeds is None:
            # NOTE: In v1, inputs_embeds is always generated at model runner,
            # this branch is for v0 compatibility.
            audio_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      audio_embeddings)
            input_ids = None

        return self.language_model.model(input_ids,
                                         positions,
                                         intermediate_tensors,
                                         inputs_embeds=inputs_embeds)

    def get_multimodal_embeddings(
        self, **kwargs
    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...],
               None]:
        """Encode the `audio_arrays` kwarg into language-model embeddings.

        Returns:
            None when no audio was passed; otherwise one tensor per audio,
            obtained by splitting the projected embeddings back per clip.
        """
        audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
        if audio_inputs is None:
            return None

        encoder_outputs = self.whisper_encoder(audio_inputs)
        factor = self.downsample_factor

        for idx, encoded in enumerate(encoder_outputs):
            seq_len, dim = encoded.shape
            # Pad such that seq_len is divisible by downsample_factor
            target_seq_len = factor * math.ceil(seq_len / factor)
            padded = torch.nn.functional.pad(
                encoded,
                (0, 0, 0, target_seq_len - seq_len),
            )
            # Stack every `factor` consecutive frames along the feature axis.
            encoder_outputs[idx] = padded.reshape(target_seq_len // factor,
                                                  dim * factor)

        # Concat, project and resplit
        rows_per_audio = [stacked.shape[0] for stacked in encoder_outputs]
        packed = torch.cat(encoder_outputs, dim=0)
        projected = self.audio_language_adapter(packed)
        return torch.split(projected, rows_per_audio, dim=0)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        """Embed `input_ids`, merging audio embeddings at audio tokens."""
        audio_tok_id = self.tokenizer.instruct.audio_encoder.audio_token

        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        if multimodal_embeddings is None:
            return inputs_embeds

        return merge_multimodal_embeddings(input_ids, inputs_embeds,
                                           multimodal_embeddings,
                                           audio_tok_id)

    def _parse_and_validate_audio_arrays(
            self, **kwargs: object) -> Union[list[torch.Tensor], None]:
        """Extract `audio_arrays` from kwargs as a list of waveforms.

        Raises:
            ValueError: If `audio_arrays` is neither a tensor nor a list.
        """
        audio_arrays = kwargs.pop("audio_arrays", None)
        if audio_arrays is None:
            return None

        if not isinstance(audio_arrays, (torch.Tensor, list)):
            raise ValueError("Incorrect type of audio_arrays. "
                             f"Got type: {type(audio_arrays)}")

        flattened = flatten_bn(audio_arrays)
        if isinstance(flattened, torch.Tensor):
            return list(flattened.unbind(0))
        return flattened

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    @classmethod
    def get_speech_to_text_config(cls, model_config: ModelConfig,
                                  task_type: str) -> SpeechToTextConfig:
        """Build the speech-to-text config from the tokenizer's audio config."""
        tokenizer = cached_tokenizer_from_config(model_config)
        audio_config = tokenizer.instruct.audio_encoder.audio_config

        return SpeechToTextConfig(
            max_audio_clip_s=audio_config.chunk_length_s,
            sample_rate=audio_config.sampling_rate,
            # mistral_common and whisper encoder take care of chunking
            min_energy_split_window_size=None,
        )

    @classmethod
    # for speech-to-text transcription
    def get_generation_prompt(cls, audio: np.ndarray,
                              model_config: ModelConfig,
                              stt_config: SpeechToTextConfig, language: str,
                              task_type: str,
                              request_prompt: str) -> PromptType:
        """Build the token prompt and audio payload for a transcription.

        NOTE: `model_config` comes before `stt_config` here, whereas the
        `SupportsTranscription` protocol declares them the other way round;
        callers should pass these arguments by keyword.
        """
        tokenizer = cached_tokenizer_from_config(model_config)
        wav_audio = Audio(audio, int(stt_config.sample_rate),
                          format="wav")  # lossless
        request = TranscriptionRequest(model=model_config.model,
                                       audio=RawAudio.from_audio(wav_audio),
                                       language=language)

        tokenized = tokenizer.instruct.encode_transcription(request)
        # Use the audio array returned by the tokenizer, not the raw input.
        audio_payload = (tokenized.audios[0].audio_array,
                         stt_config.sample_rate)
        prompts_dict = {
            "multi_modal_data": {
                "audio": audio_payload
            },
            "prompt_token_ids": tokenized.tokens,
        }
        return cast(PromptType, prompts_dict)

    @classmethod
    def validate_language(cls, language: str) -> bool:
        # same as whisper
        return WhisperForConditionalGeneration.validate_language(language)

    @classmethod
    def get_num_audio_tokens(cls, audio_duration_s: float,
                             stt_config: SpeechToTextConfig,
                             model_config: ModelConfig) -> Optional[int]:
        """
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
        This is used for estimating the amount of processing for this audio.
        """
        tokenizer = cached_tokenizer_from_config(model_config)
        adapter = VoxtralProcessorAdapter(tokenizer)
        num_samples = int(audio_duration_s * stt_config.sample_rate)
        return adapter.get_num_audio_tokens(num_samples)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load checkpoint weights into the encoder, adapter and LLM.

        Encoder weights are handed to `whisper_encoder.load_weight`, adapter
        weights are loaded directly, and everything else is streamed to the
        language model's own `load_weights`.

        Returns:
            The set of fully qualified parameter names reported as loaded.
        """
        # fmt: off
        remapping_rules = [
            (r"mm_whisper_embeddings\.(.*)", r"\1"),
            (r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
            (r"audio_language_adapter\.0\.weight", r"audio_language_adapter.w_in.weight"),  # noqa: E501
            (r"audio_language_adapter\.2\.weight", r"audio_language_adapter.w_out.weight"),  # noqa: E501
        ]
        # fmt: on

        audio_params = {
            f"audio_language_adapter.{param_name}": param
            for param_name, param in
            self.audio_language_adapter.named_parameters()
        }
        non_encoder_prefixes = (
            "mm_whisper_embeddings.tok_embeddings",
            "mm_whisper_embeddings.audio_language_projection",
        )

        loaded_weights = set()

        def llm_weights_generator():
            for name, tensor in weights:
                # Decided on the checkpoint name, before any remapping.
                is_encoder = (name.startswith("mm_whisper_embeddings")
                              and not name.startswith(non_encoder_prefixes))

                # Rules are applied in order, each on the previous result.
                for pattern, repl in remapping_rules:
                    if re.fullmatch(pattern, name):
                        name = re.sub(pattern, repl, name)

                if is_encoder:
                    name = self.whisper_encoder.load_weight((name, tensor))
                    loaded_weights.add(f"whisper_encoder.{name}")
                    continue

                if name not in audio_params:
                    # Everything else belongs to the language model.
                    yield (name, tensor)
                    continue

                with torch.no_grad():
                    default_weight_loader(audio_params[name], tensor)
                loaded_weights.add(name)

        for name in self.language_model.load_weights(llm_weights_generator()):
            loaded_weights.add(f"language_model.{name}")

        # potentially manually add position embeddings: always report them
        # as loaded to make sure we don't hit an error here
        loaded_weights.add(
            "whisper_encoder.whisper_encoder.embed_positions.weight")
        return loaded_weights
class AudioLanguageAdapter(nn.Module):
    """Bias-free two-layer MLP with a GELU in between.

    Projects inputs of width `hidden_size` to width `dim`.
    """

    def __init__(self, hidden_size: int, dim: int) -> None:
        super().__init__()
        self.w_in = nn.Linear(hidden_size, dim, bias=False)
        self.gelu = nn.GELU()
        self.w_out = nn.Linear(dim, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = self.w_in(x)
        activated = self.gelu(projected)
        return self.w_out(activated)
class VoxtralEncoderModel(nn.Module):
# Name of the fused projection mapped to the separate projection names it
# is built from.
# NOTE(review): presumably consumed by vLLM's generic weight-loading or
# quantization tooling -- confirm where this attribute is read.
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
# (pattern, replacement) regex pairs translating Mistral-format checkpoint
# names into this module's parameter names. `load_weight` rewrites a name
# with a rule only when the pattern matches the whole name.
# fmt: off
mistral_remapping = [
(r"whisper_encoder\.conv_layers\.0\.(weight|bias)", r"whisper_encoder.conv1.\1"), # noqa: E501
(r"whisper_encoder\.conv_layers\.1\.(weight|bias)", r"whisper_encoder.conv2.\1"), # noqa: E501
(r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn.\2_proj.\3"), # noqa: E501
(r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.wo\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn.out_proj.\2"), # noqa: E501
(r"whisper_encoder\.transformer\.layers\.(\d+)\.attention_norm\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn_layer_norm.\2"), # noqa: E501
(r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w1\.(weight|bias)", r"whisper_encoder.layers.\1.mlp.fc1.\2"), # noqa: E501
(r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w2\.(weight|bias)", r"whisper_encoder.layers.\1.mlp.fc2.\2"), # noqa: E501
(r"whisper_encoder\.transformer\.layers\.(\d+)\.ffn_norm\.(weight|bias)", r"whisper_encoder.layers.\1.final_layer_norm.\2"), # noqa: E501
(r"whisper_encoder\.transformer\.norm\.(weight|bias)", r"whisper_encoder.layer_norm.\1"), # noqa: E501
]
# fmt: on
def __init__(
    self,
    vllm_config: VllmConfig,
    *,
    prefix: str = "",
) -> None:
    """Create the Whisper encoder and the mel filter bank.

    Args:
        vllm_config: Config whose `model_config.hf_config` is treated as a
            `WhisperConfig` for the audio encoder.
        prefix: Parameter-name prefix of this module.
    """
    super().__init__()
    model_config = vllm_config.model_config
    self.config = cast(WhisperConfig, model_config.hf_config)
    self.dtype: torch.dtype = model_config.dtype

    encoder_prefix = maybe_prefix(prefix, "whisper_encoder")
    self.whisper_encoder = WhisperEncoder(vllm_config=vllm_config,
                                          prefix=encoder_prefix,
                                          is_standalone_encoder=True,
                                          init_in_fp32=True)

    # One frequency bin per non-negative FFT frequency of the STFT window.
    filter_bank = mel_filter_bank(
        num_frequency_bins=1 + self.config.window_size // 2,
        num_mel_bins=self.config.num_mel_bins,
        min_frequency=0.0,
        max_frequency=8000.0,
        sampling_rate=self.config.sampling_rate,
    )
    # Kept as a plain attribute (not a registered buffer or parameter).
    self.mel_filters = torch.tensor(filter_bank, dtype=torch.float32)
def compute_whisper_melspec(
self,
audio_waveforms: torch.Tensor,
) -> torch.Tensor:
input_dtype = audio_waveforms.dtype
window = torch.hann_window(self.config.window_size).to(
audio_waveforms.device)
stft = torch.stft(
audio_waveforms,
self.config.window_size,
self.config.hop_length,
window=window,
return_complex=True,
)
magnitudes = stft[..., :-1].abs()**2
mel_spec = self.mel_filters.T @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
log_spec = (log_spec + 4.0) / 4.0
return log_spec.to(input_dtype)
@property
def downsample_factor(self) -> int:
    """Product of the first-axis strides of the encoder's two convolutions."""
    encoder = self.whisper_encoder
    first_stride = encoder.conv1.stride[0]
    second_stride = encoder.conv2.stride[0]
    return first_stride * second_stride
@property
def chunk_size(self) -> int:
    """Chunk length: ``max_source_positions`` times the downsample factor."""
    max_positions = self.config.max_source_positions
    return max_positions * self.downsample_factor
def prepare_inputs_for_conv(
    self,
    audio_waveforms: list[torch.Tensor],
) -> tuple[torch.Tensor, list[int]]:
    """Compute mel spectrograms and cut them into encoder-sized chunks.

    Args:
        audio_waveforms: One waveform tensor per example; must be a list.

    Returns:
        A pair ``(chunks, chunks_per_example)``: all chunks of all examples
        stacked along a new leading dimension, and the number of chunks
        each example contributed (in input order).
    """
    assert isinstance(audio_waveforms, list)

    # list[num_mel_bins, seq_len]
    mel_specs = [
        self.compute_whisper_melspec(waveform).to(self.dtype)
        for waveform in audio_waveforms
    ]

    # Split every spectrogram along its last axis into chunk_size pieces.
    pieces_per_example = [
        spec.split(self.chunk_size, dim=-1) for spec in mel_specs
    ]
    chunks_per_example: list[int] = [
        len(pieces) for pieces in pieces_per_example
    ]
    chunked_features: list[torch.Tensor] = [
        piece for pieces in pieces_per_example for piece in pieces
    ]

    # [total_num_chunks, num_mel_bins, chunk_size]
    return torch.stack(chunked_features), chunks_per_example
def forward(
    self, input_features: Union[torch.Tensor, list[torch.Tensor]]
) -> list[torch.Tensor]:
    """Encode one or more waveforms, returning one tensor per example.

    Args:
        input_features: A single tensor or a list of tensors; a single
            tensor is treated as a one-element list.

    Returns:
        A list with one entry per example, each being that example's
        encoded chunks flattened over the first two dimensions.
    """
    if not isinstance(input_features, list):
        input_features = [input_features]

    # Split long inputs into chunks
    input_embeds, chunks_per_example = self.prepare_inputs_for_conv(
        input_features)

    # [total_num_chunks, ceil(chunk_size / downsample_factor), hidden_size]
    out = self.whisper_encoder([input_embeds])

    # Re-concatenate the chunks: each example owns a contiguous run of rows.
    results = []
    start = 0
    for n_chunks in chunks_per_example:
        stop = start + n_chunks
        results.append(out[start:stop].flatten(0, 1))
        start = stop
    return results
def load_weight(self, weight: tuple[str, torch.Tensor]) -> str:
    """Load a single checkpoint tensor into the matching parameter.

    The checkpoint name is first rewritten through every
    ``self.mistral_remapping`` pattern that fully matches it. If the
    resulting name refers to a q/k/v projection, the tensor is loaded as the
    corresponding shard of the fused ``qkv_proj`` parameter; otherwise it is
    loaded directly.

    Args:
        weight: ``(name, tensor)`` pair from the checkpoint.

    Returns:
        The name of the parameter the tensor was loaded into.
    """
    # (param_name, shard_name, shard_id)
    qkv_shards = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]
    params_dict = dict(self.named_parameters())

    name, loaded_weight = weight
    for pattern, repl in self.mistral_remapping:
        if re.fullmatch(pattern, name) is not None:
            name = re.sub(pattern, repl, name)

    # First shard entry whose checkpoint-side name occurs in ``name``.
    shard = next((entry for entry in qkv_shards if entry[1] in name), None)

    if shard is None:
        # Regular parameter: use its own loader if it defines one.
        param = params_dict[name]
        loader = getattr(param, "weight_loader", default_weight_loader)
        loader(param, loaded_weight)
        return name

    fused_name, shard_name, shard_id = shard
    name = name.replace(shard_name, fused_name)
    param = params_dict[name]
    loader = param.weight_loader
    loader(param, loaded_weight, shard_id)
    return name

View File

@ -3,6 +3,7 @@
import math
from collections.abc import Iterable, Mapping, Sequence
from contextlib import nullcontext
from typing import Optional, TypedDict, Union, cast
import numpy as np
@ -13,6 +14,7 @@ from transformers import (BatchFeature, WhisperConfig, WhisperFeatureExtractor,
from transformers.models.whisper.modeling_whisper import sinusoids
from vllm.attention import Attention, AttentionType
from vllm.attention.layer import MultiHeadAttention
from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig,
VllmConfig)
from vllm.distributed import get_tensor_model_parallel_world_size
@ -26,6 +28,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
@ -178,6 +181,7 @@ class WhisperAttention(nn.Module):
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
standalone_encoder: bool = False,
):
super().__init__()
self.embed_dim = embed_dim
@ -213,16 +217,24 @@ class WhisperAttention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.out_proj",
)
self.attn = Attention(
self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
attn_type=self.attn_type,
)
if standalone_encoder:
self.attn = MultiHeadAttention(
self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
)
else:
self.attn = Attention(
self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
attn_type=self.attn_type,
)
def _init_qkv(
self,
@ -357,7 +369,11 @@ class WhisperMLP(nn.Module):
class WhisperEncoderLayer(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
def __init__(self,
*,
vllm_config: VllmConfig,
prefix: str = "",
is_standalone_encoder: bool = False):
super().__init__()
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
@ -371,6 +387,7 @@ class WhisperEncoderLayer(nn.Module):
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
standalone_encoder=is_standalone_encoder,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.mlp = WhisperMLP(
@ -462,10 +479,16 @@ class WhisperDecoderLayer(nn.Module):
class WhisperEncoder(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
def __init__(self,
*,
vllm_config: VllmConfig,
prefix: str = "",
is_standalone_encoder: bool = False,
init_in_fp32: bool = False):
super().__init__()
config = vllm_config.model_config.hf_config
embed_dim = config.d_model
self.is_standalone_encoder = is_standalone_encoder
self.num_mel_bins = config.num_mel_bins
self.max_source_positions = config.max_source_positions
self.embed_scale = (math.sqrt(embed_dim)
@ -480,17 +503,25 @@ class WhisperEncoder(nn.Module):
kernel_size=3,
stride=2,
padding=1)
self.embed_positions = nn.Embedding(self.max_source_positions,
embed_dim)
self.start_layer, self.end_layer, self.layers = make_layers(
config.encoder_layers,
lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config,
prefix=f"{prefix}.layers"),
prefix=f"{prefix}.layers",
is_standalone_encoder=
is_standalone_encoder),
prefix=f"{prefix}.layers",
)
self.layer_norm = nn.LayerNorm(config.d_model)
with torch.no_grad():
maybe_fp32_init_ctx = set_default_torch_dtype(
torch.float32) if init_in_fp32 else nullcontext()
with (
torch.no_grad(),
maybe_fp32_init_ctx,
):
self.embed_positions = nn.Embedding(self.max_source_positions,
embed_dim)
self.embed_positions.weight.copy_(
sinusoids(*self.embed_positions.weight.shape))
@ -499,8 +530,10 @@ class WhisperEncoder(nn.Module):
for features in input_features:
embeds = nn.functional.gelu(self.conv1(features))
embeds = nn.functional.gelu(self.conv2(embeds))
embeds = embeds.permute(1, 0)
embeds = embeds + self.embed_positions.weight[:embeds.size(0), :]
embeds = embeds.transpose(-1, -2)
embeds = (embeds +
self.embed_positions.weight[:embeds.size(-2), :]).to(
embeds.dtype)
hidden_states.append(embeds)
hidden_states = torch.cat(hidden_states)
@ -792,10 +825,14 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
f"or {list(ISO639_1_OTHER_LANGS.values())}")
@classmethod
def get_generation_prompt(cls, audio: np.ndarray,
stt_config: SpeechToTextConfig, language: str,
task_type: str,
request_prompt: str) -> PromptType:
def get_generation_prompt(
cls,
audio: np.ndarray,
model_config: ModelConfig, # not needed here
stt_config: SpeechToTextConfig,
language: str,
task_type: str,
request_prompt: str) -> PromptType:
prompt = {
"encoder_prompt": {
# Whisper does not support encoder prompt.

View File

@ -9,7 +9,6 @@ from typing import Annotated, Any, Optional, Union
import msgspec
from pydantic import BaseModel
from typing_extensions import deprecated
from vllm.logger import init_logger
from vllm.logits_process import LogitsProcessor
@ -84,27 +83,6 @@ class GuidedDecodingParams:
"You can only use one kind of guided decoding but multiple are "
f"specified: {self.__dict__}")
if self.backend is not None and ":" in self.backend:
self._extract_backend_options()
@deprecated(
"Passing guided decoding backend options inside backend in the format "
"'backend:...' is deprecated. This will be removed in v0.10.0. Please "
"use the dedicated arguments '--disable-fallback', "
"'--disable-any-whitespace' and '--disable-additional-properties' "
"instead.")
def _extract_backend_options(self):
"""Extract backend options from the backend string."""
assert isinstance(self.backend, str)
self.backend, options = self.backend.split(":")
options_set = set(options.strip().split(","))
if "no-fallback" in options_set:
self.disable_fallback = True
if "disable-any-whitespace" in options_set:
self.disable_any_whitespace = True
if "no-additional-properties" in options_set:
self.disable_additional_properties = True
class RequestOutputKind(Enum):
# Return entire output so far in every RequestOutput

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers import PretrainedConfig
from transformers import PretrainedConfig, WhisperConfig
from vllm.logger import init_logger
@ -24,9 +24,21 @@ def adapt_config_dict(config_dict: dict[str, Any],
if bool(config_dict.get("yarn")):
config_dict = _remap_mistral_yarn_args(config_dict)
if bool((config_dict.get("multimodal") or {}).get("vision_encoder_args")
or config_dict.get("vision_encoder")):
is_vision = ((config_dict.get("multimodal")
or {}).get("vision_encoder_args")
or config_dict.get("vision_encoder"))
is_audio = bool(
((config_dict.get("multimodal") or {}).get("whisper_model_args")
or {}).get("encoder_args"))
assert not (is_vision and is_audio), \
"Vision and audio are mutually exclusive"
if is_vision:
config_dict = _remap_mistral_vision_args(config_dict)
if is_audio:
config_dict = _remap_mistral_audio_args(config_dict)
config = PretrainedConfig.from_dict(config_dict)
@ -118,3 +130,35 @@ def _remap_mistral_quantization_args(config: dict) -> dict:
config["quantization_config"] = quantization_config
return config
def _remap_mistral_audio_args(config: dict) -> dict:
    """Rewrite a Mistral-format audio config into a text + audio config dict.

    ``whisper_model_args`` is popped out of ``config["multimodal"]`` (the
    input dict is mutated). What remains of ``config`` becomes the
    ``text_config``, and the encoder/downsample arguments are used to build
    a ``WhisperConfig`` stored as ``audio_config``. An existing, truthy
    ``quantization_config`` is carried over to the top level of the result.
    """
    whisper_args = config["multimodal"].pop("whisper_model_args")
    encoder_args = whisper_args["encoder_args"]
    downsample_args = whisper_args["downsample_args"]
    quant_config = config.get("quantization_config")

    # Everything left after removing the whisper args describes the text model.
    text_config = PretrainedConfig.from_dict(config)

    encoding_args = encoder_args["audio_encoding_args"]
    audio_config = WhisperConfig(
        num_mel_bins=encoding_args["num_mel_bins"],
        window_size=encoding_args["window_size"],
        sampling_rate=encoding_args["sampling_rate"],
        hop_length=encoding_args["hop_length"],
        downsample_factor=downsample_args["downsample_factor"],
        d_model=encoder_args["dim"],
        encoder_layers=encoder_args["n_layers"],
        encoder_ffn_dim=encoder_args["hidden_dim"],
        encoder_attention_heads=encoder_args["n_heads"],
        vocab_size=encoder_args["vocab_size"],
        max_source_positions=encoder_args["max_source_positions"],
    )

    remapped = {
        "model_type": "whixtral",
        "architectures": ["VoxtralForConditionalGeneration"],
        "text_config": text_config,
        "audio_config": audio_config,
    }
    if quant_config:
        remapped["quantization_config"] = quant_config
    return remapped

View File

@ -78,7 +78,12 @@ class KVCacheManager:
) -> None:
self.max_model_len = max_model_len
if len(kv_cache_config.kv_cache_groups) == 0:
# Attention free models don't have kv cache,
# thus don't need prefix caching.
enable_caching = False
self.enable_caching = enable_caching
self.caching_hash_fn = (
sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
sha256 if caching_hash_algo == "sha256" else hash)
@ -101,7 +106,7 @@ class KVCacheManager:
kv_cache_config=kv_cache_config,
max_model_len=self.max_model_len,
use_eagle=self.use_eagle,
enable_caching=enable_caching,
enable_caching=self.enable_caching,
caching_hash_fn=self.caching_hash_fn,
enable_kv_cache_events=enable_kv_cache_events,
)

View File

@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
ValueError: If there is not enough memory available for the KV cache.
"""
# No need to check for available memory if the kv_cache_spec is empty
if not kv_cache_spec:
return
if available_memory <= 0:
raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
return len(page_sizes) == 1
def is_kv_cache_type_attention_free(
        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
    """Return True when ``kv_cache_spec`` is empty.

    Attention free models have no KV cache layers, so their spec is an
    empty dict.
    """
    has_kv_cache_layers = bool(kv_cache_spec)
    return not has_kv_cache_layers
def _get_kv_cache_config_uniform_page_size(
vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
available_memory: int) -> KVCacheConfig:
@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
return kv_cache_config
def _get_kv_cache_config_attention_free() -> KVCacheConfig:
    """Build a KV cache config with one block and no tensors or groups."""
    return KVCacheConfig(
        num_blocks=1,
        kv_cache_tensors=[],
        kv_cache_groups=[],
    )
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
"""
This function tries to convert the KV cache specs to one type if the model
@ -957,7 +972,11 @@ def get_kv_cache_config(
if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
unify_hybrid_kv_cache_specs(kv_cache_spec)
if is_kv_cache_type_uniform(kv_cache_spec):
if is_kv_cache_type_attention_free(kv_cache_spec):
# This returns a kv_cache config with 0 kv_cache groups and 1 block
# to allow for the KVCache manager to handle attention free models.
return _get_kv_cache_config_attention_free()
elif is_kv_cache_type_uniform(kv_cache_spec):
# KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for
# each layer.

View File

@ -139,7 +139,13 @@ class EngineCore:
# Profiles the peak memory usage of the model to determine how much
# memory can be allocated for kv cache.
available_gpu_memory = self.model_executor.determine_available_memory()
has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
if has_kv_cache:
available_gpu_memory = \
self.model_executor.determine_available_memory()
else:
# Attention free models don't need memory for kv cache
available_gpu_memory = [0] * len(kv_cache_specs)
assert len(kv_cache_specs) == len(available_gpu_memory)
# Get the kv cache tensor size