Compare commits
14 Commits
benchmark
...
nixl-upstr
| Author | SHA1 | Date | |
|---|---|---|---|
| 8a8b40d417 | |||
| c3f7afa6a8 | |||
| 6cd8dec23f | |||
| 723263fa23 | |||
| f29fd8a7f8 | |||
| ed10f3cea1 | |||
| b637e9dcb8 | |||
| 1e36c8687e | |||
| 5bac61362b | |||
| 313ae8c16a | |||
| c847e34b39 | |||
| e7e3e6d263 | |||
| 4ffd963fa0 | |||
| 56fe4bedd6 |
@ -645,7 +645,7 @@ steps:
|
||||
optional: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
|
||||
6
.gemini/config.yaml
Normal file
6
.gemini/config.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
|
||||
have_fun: false # Just review the code
|
||||
code_review:
|
||||
comment_severity_threshold: HIGH # Reduce quantity of comments
|
||||
pull_request_opened:
|
||||
summary: false # Don't summarize the PR in a separate comment
|
||||
@ -30,17 +30,11 @@ from datasets import load_dataset
|
||||
from PIL import Image
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
try:
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.utils import get_adapter_absolute_path
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||
except:
|
||||
MultiModalDataDict = None
|
||||
AnyTokenizer = None
|
||||
LoRARequest = None
|
||||
print("Install vLLM to use LoRA or Multimodal benchmarking.")
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.utils import get_adapter_absolute_path
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
|
||||
#include "attention_dtypes.h"
|
||||
#include "attention_utils.cuh"
|
||||
#include "cuda_compat.h"
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_bf16.h>
|
||||
@ -33,12 +34,6 @@ typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#include "../quantization/fp8/nvidia/quant_utils.cuh"
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define WARP_SIZE 32
|
||||
#else
|
||||
#define WARP_SIZE warpSize
|
||||
#endif
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
|
||||
@ -670,7 +665,6 @@ __global__ void paged_attention_v2_reduce_kernel(
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
#undef WARP_SIZE
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#undef DIVIDE_ROUND_UP
|
||||
|
||||
@ -18,12 +18,7 @@
|
||||
*/
|
||||
|
||||
#include "attention_kernels.cuh"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define WARP_SIZE 32
|
||||
#else
|
||||
#define WARP_SIZE warpSize
|
||||
#endif
|
||||
#include "cuda_compat.h"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -187,7 +182,6 @@ void paged_attention_v1(
|
||||
CALL_V1_LAUNCHER_BLOCK_SIZE)
|
||||
}
|
||||
|
||||
#undef WARP_SIZE
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#undef DIVIDE_ROUND_UP
|
||||
|
||||
@ -18,12 +18,7 @@
|
||||
*/
|
||||
|
||||
#include "attention_kernels.cuh"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define WARP_SIZE 32
|
||||
#else
|
||||
#define WARP_SIZE warpSize
|
||||
#endif
|
||||
#include "cuda_compat.h"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -197,7 +192,6 @@ void paged_attention_v2(
|
||||
CALL_V2_LAUNCHER_BLOCK_SIZE)
|
||||
}
|
||||
|
||||
#undef WARP_SIZE
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#undef DIVIDE_ROUND_UP
|
||||
|
||||
@ -4,10 +4,10 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define WARP_SIZE 32
|
||||
#if defined(USE_ROCM) && defined(__GFX9__)
|
||||
#define WARP_SIZE 64
|
||||
#else
|
||||
#define WARP_SIZE warpSize
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
||||
@ -8,7 +8,6 @@ API documentation for vLLM's configuration classes.
|
||||
|
||||
- [vllm.config.ModelConfig][]
|
||||
- [vllm.config.CacheConfig][]
|
||||
- [vllm.config.TokenizerPoolConfig][]
|
||||
- [vllm.config.LoadConfig][]
|
||||
- [vllm.config.ParallelConfig][]
|
||||
- [vllm.config.SchedulerConfig][]
|
||||
|
||||
@ -1,3 +1,7 @@
|
||||
---
|
||||
toc_depth: 4
|
||||
---
|
||||
|
||||
# vLLM CLI Guide
|
||||
|
||||
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
|
||||
@ -42,6 +46,10 @@ Start the vLLM OpenAI Compatible API server.
|
||||
vllm serve --help=page
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
--8<-- "docs/argparse/serve.md"
|
||||
|
||||
## chat
|
||||
|
||||
Generate chat completions via the running API server.
|
||||
|
||||
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
## CLI Arguments
|
||||
|
||||
The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
To see the available CLI arguments, run `vllm serve --help`!
|
||||
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
|
||||
|
||||
## Configuration file
|
||||
|
||||
|
||||
@ -103,9 +103,7 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m
|
||||
|
||||
vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.
|
||||
|
||||
By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.
|
||||
|
||||
Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.
|
||||
However, when `tool_choice='none'` is specified, vLLM still includes the tool definitions in the prompt.
|
||||
|
||||
## Automatic Function Calling
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ sys.modules["blake3"] = MagicMock()
|
||||
sys.modules["vllm._C"] = MagicMock()
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
|
||||
from vllm.utils import FlexibleArgumentParser # noqa: E402
|
||||
|
||||
logger = logging.getLogger("mkdocs")
|
||||
@ -24,15 +25,18 @@ logger = logging.getLogger("mkdocs")
|
||||
class MarkdownFormatter(HelpFormatter):
|
||||
"""Custom formatter that generates markdown for argument groups."""
|
||||
|
||||
def __init__(self, prog):
|
||||
def __init__(self, prog, starting_heading_level=3):
|
||||
super().__init__(prog,
|
||||
max_help_position=float('inf'),
|
||||
width=float('inf'))
|
||||
self._section_heading_prefix = "#" * starting_heading_level
|
||||
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
||||
self._markdown_output = []
|
||||
|
||||
def start_section(self, heading):
|
||||
if heading not in {"positional arguments", "options"}:
|
||||
self._markdown_output.append(f"\n### {heading}\n\n")
|
||||
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
|
||||
def end_section(self):
|
||||
pass
|
||||
@ -46,9 +50,13 @@ class MarkdownFormatter(HelpFormatter):
|
||||
|
||||
def add_arguments(self, actions):
|
||||
for action in actions:
|
||||
if (len(action.option_strings) == 0
|
||||
or "--help" in action.option_strings):
|
||||
continue
|
||||
|
||||
option_strings = f'`{"`, `".join(action.option_strings)}`'
|
||||
self._markdown_output.append(f"#### {option_strings}\n\n")
|
||||
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
|
||||
if choices := action.choices:
|
||||
choices = f'`{"`, `".join(str(c) for c in choices)}`'
|
||||
@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
|
||||
return cls.add_cli_args(parser, **kwargs)
|
||||
|
||||
|
||||
def create_serve_parser() -> FlexibleArgumentParser:
    """Build the `serve` argument parser so that its help renders as markdown.

    Returns:
        The parser returned by `make_arg_parser`, configured to format help
        output with `MarkdownFormatter`.
    """

    def _serve_formatter(prog):
        # Headings start one level deeper than the formatter's default of 3.
        # NOTE(review): presumably so the generated sections nest under an
        # existing level-3 heading in the docs page that includes them.
        return MarkdownFormatter(prog, starting_heading_level=4)

    serve_parser = FlexibleArgumentParser()
    serve_parser.formatter_class = _serve_formatter
    return make_arg_parser(serve_parser)
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
logger.info("Generating argparse documentation")
|
||||
logger.debug("Root directory: %s", ROOT_DIR.resolve())
|
||||
@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
"engine_args": create_parser(EngineArgs),
|
||||
"async_engine_args": create_parser(AsyncEngineArgs,
|
||||
async_args_only=True),
|
||||
"serve": create_serve_parser(),
|
||||
}
|
||||
|
||||
# Generate documentation for each parser
|
||||
|
||||
@ -10,7 +10,7 @@ on HuggingFace model repository.
|
||||
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
from typing import Any, NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
@ -30,7 +30,9 @@ question_per_audio_count = {
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
prompt: Optional[str] = None
|
||||
prompt_token_ids: Optional[dict[str, list[int]]] = None
|
||||
multi_modal_data: Optional[dict[str, Any]] = None
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Voxtral
|
||||
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
    """Prepare engine arguments and an already-tokenized request for Voxtral.

    The chat request is encoded with the `mistral_common` tokenizer, so the
    returned data carries `prompt_token_ids` and `multi_modal_data` rather
    than a text prompt.

    Args:
        question: Text placed after the audio chunks in the user message.
        audio_count: Number of entries taken from `audio_assets`, also used
            as the per-prompt audio limit.

    Returns:
        A `ModelRequestData` with engine arguments, token ids and audio data.
    """
    from mistral_common.audio import Audio
    from mistral_common.protocol.instruct.messages import (
        AudioChunk,
        RawAudio,
        TextChunk,
        UserMessage,
    )
    from mistral_common.protocol.instruct.request import ChatCompletionRequest
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

    model_name = "mistralai/Voxtral-Mini-3B-2507"
    tokenizer = MistralTokenizer.from_hf_hub(model_name)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
        config_format="mistral",
        load_format="mistral",
        tokenizer_mode="mistral",
        enforce_eager=True,
        enable_chunked_prefill=False,
    )

    # User message content: every audio clip first, then the text question.
    content = []
    for index in range(audio_count):
        clip_path = str(audio_assets[index].get_local_path())
        clip = Audio.from_file(clip_path, strict=False)
        content.append(AudioChunk(input_audio=RawAudio.from_audio(clip)))
    content.append(TextChunk(text=question))

    request = ChatCompletionRequest(
        messages=[UserMessage(content=content)],
        model=model_name,
    )
    encoded = tokenizer.encode_chat_completion(request)

    # Pair each encoded audio array with its sampling rate.
    audio_inputs = [
        (item.audio_array, item.sampling_rate) for item in encoded.audios
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompt_token_ids=encoded.tokens,
        multi_modal_data={"audio": audio_inputs},
    )
|
||||
|
||||
|
||||
# Granite Speech
|
||||
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
|
||||
# NOTE - the settings in this example are somewhat different than what is
|
||||
@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"voxtral": run_voxtral,
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
@ -311,16 +368,24 @@ def main(args):
|
||||
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
|
||||
)
|
||||
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
mm_data = {
|
||||
"audio": [
|
||||
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
|
||||
]
|
||||
}
|
||||
mm_data = req_data.multi_modal_data
|
||||
if not mm_data:
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
mm_data = {
|
||||
"audio": [
|
||||
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
|
||||
]
|
||||
}
|
||||
|
||||
assert args.num_prompts > 0
|
||||
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
|
||||
inputs = {"multi_modal_data": mm_data}
|
||||
|
||||
if req_data.prompt:
|
||||
inputs["prompt"] = req_data.prompt
|
||||
else:
|
||||
inputs["prompt_token_ids"] = req_data.prompt_token_ids
|
||||
|
||||
if args.num_prompts > 1:
|
||||
# Batch inference
|
||||
inputs = [inputs] * args.num_prompts
|
||||
|
||||
@ -33,7 +33,7 @@ pyzmq >= 25.0.0
|
||||
msgspec
|
||||
gguf >= 0.13.0
|
||||
importlib_metadata; python_version < '3.10'
|
||||
mistral_common[opencv] >= 1.6.2
|
||||
mistral_common[opencv] >= 1.8.0
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
|
||||
@ -17,6 +17,7 @@ cloudpickle
|
||||
fastapi
|
||||
msgspec
|
||||
openai
|
||||
partial-json-parser
|
||||
pillow
|
||||
psutil
|
||||
pybase64
|
||||
|
||||
@ -23,7 +23,7 @@ jiwer # required for audio tests
|
||||
timm # required for internvl test
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.6.2 # required for pixtral test
|
||||
mistral_common[opencv] >= 1.8.0 # required for voxtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
|
||||
@ -28,7 +28,7 @@ torchvision==0.22.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
mamba_ssm # required for plamo2 test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.7.0 # required for pixtral test
|
||||
mistral_common[opencv] >= 1.8.0 # required for voxtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
|
||||
@ -305,7 +305,7 @@ mbstrdecoder==1.1.3
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common==1.7.0
|
||||
mistral-common==1.8.0
|
||||
# via -r requirements/test.in
|
||||
more-itertools==10.5.0
|
||||
# via lm-eval
|
||||
@ -518,6 +518,8 @@ pyasn1-modules==0.4.2
|
||||
# via google-auth
|
||||
pybind11==2.13.6
|
||||
# via lm-eval
|
||||
pycountry==24.6.1
|
||||
# via pydantic-extra-types
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pycryptodomex==3.22.0
|
||||
@ -528,9 +530,12 @@ pydantic==2.11.5
|
||||
# datamodel-code-generator
|
||||
# mistral-common
|
||||
# mteb
|
||||
# pydantic-extra-types
|
||||
# ray
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pydantic-extra-types==2.10.5
|
||||
# via mistral-common
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyparsing==3.2.0
|
||||
@ -835,6 +840,7 @@ typing-extensions==4.12.2
|
||||
# pqdm
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# pydantic-extra-types
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspection
|
||||
|
||||
3
setup.py
3
setup.py
@ -692,7 +692,8 @@ setup(
|
||||
"tensorizer": ["tensorizer==2.10.1"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
|
||||
"audio": ["librosa", "soundfile"], # Required for audio processing
|
||||
"audio": ["librosa", "soundfile",
|
||||
"mistral_common[audio]"], # Required for audio processing
|
||||
"video": [] # Kept for backwards compatibility
|
||||
},
|
||||
cmdclass=cmdclass,
|
||||
|
||||
@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
|
||||
def api_server(distributed_executor_backend: str):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
@ -40,8 +40,6 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
|
||||
"facebook/opt-125m",
|
||||
"--host",
|
||||
"127.0.0.1",
|
||||
"--tokenizer-pool-size",
|
||||
str(tokenizer_pool_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_executor_backend,
|
||||
]
|
||||
@ -54,10 +52,8 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
||||
def test_api_server(api_server, tokenizer_pool_size: int,
|
||||
distributed_executor_backend: str):
|
||||
def test_api_server(api_server, distributed_executor_backend: str):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from argparse import ArgumentError, ArgumentTypeError
|
||||
from argparse import ArgumentError
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Annotated, Literal, Optional
|
||||
@ -12,8 +12,8 @@ import pytest
|
||||
from vllm.config import CompilationConfig, config
|
||||
from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
|
||||
get_type, get_type_hints, is_not_builtin,
|
||||
is_type, literal_to_kwargs, nullable_kvs,
|
||||
optional_type, parse_type)
|
||||
is_type, literal_to_kwargs, optional_type,
|
||||
parse_type)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -25,18 +25,10 @@ from vllm.utils import FlexibleArgumentParser
|
||||
"foo": 1,
|
||||
"bar": 2
|
||||
}),
|
||||
(json.loads, "foo=1,bar=2", {
|
||||
"foo": 1,
|
||||
"bar": 2
|
||||
}),
|
||||
])
|
||||
def test_parse_type(type, value, expected):
|
||||
parse_type_func = parse_type(type)
|
||||
context = nullcontext()
|
||||
if value == "foo=1,bar=2":
|
||||
context = pytest.warns(DeprecationWarning)
|
||||
with context:
|
||||
assert parse_type_func(value) == expected
|
||||
assert parse_type_func(value) == expected
|
||||
|
||||
|
||||
def test_optional_type():
|
||||
@ -203,34 +195,6 @@ def test_get_kwargs():
|
||||
assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("arg", "expected"), [
|
||||
(None, dict()),
|
||||
("image=16", {
|
||||
"image": 16
|
||||
}),
|
||||
("image=16,video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
("Image=16, Video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
])
|
||||
def test_limit_mm_per_prompt_parser(arg, expected):
|
||||
"""This functionality is deprecated and will be removed in the future.
|
||||
This argument should be passed as JSON string instead.
|
||||
|
||||
TODO: Remove with nullable_kvs."""
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
if arg is None:
|
||||
args = parser.parse_args([])
|
||||
else:
|
||||
args = parser.parse_args(["--limit-mm-per-prompt", arg])
|
||||
|
||||
assert args.limit_mm_per_prompt == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg", "expected"),
|
||||
[
|
||||
@ -326,18 +290,6 @@ def test_prefix_cache_default():
|
||||
assert not engine_args.enable_prefix_caching
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg"),
|
||||
[
|
||||
"image", # Missing =
|
||||
"image=4,image=5", # Conflicting values
|
||||
"image=video=4" # Too many = in tokenized arg
|
||||
])
|
||||
def test_bad_nullable_kvs(arg):
|
||||
with pytest.raises(ArgumentTypeError):
|
||||
nullable_kvs(arg)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("arg", "expected", "option"), [
|
||||
(None, None, "mm-processor-kwargs"),
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from typing import Final
|
||||
|
||||
import pytest
|
||||
@ -29,7 +30,7 @@ def server():
|
||||
"--enforce-eager",
|
||||
"--trust-remote-code",
|
||||
"--limit-mm-per-prompt",
|
||||
f"image={MAXIMUM_IMAGES}",
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
|
||||
@ -17,6 +17,11 @@ from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
@ -33,9 +38,18 @@ def winning_call():
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
|
||||
async def test_basic_audio(mary_had_lamb, model_name):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
|
||||
# TODO(PATRICK) - REMOVE AFTER RELEASE
|
||||
return # skip for now
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -65,10 +79,13 @@ async def test_bad_requests(mary_had_lamb):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_long_audio_request(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
|
||||
async def test_long_audio_request(mary_had_lamb, model_name):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("openai"):
|
||||
return
|
||||
|
||||
mary_had_lamb.seek(0)
|
||||
audio, sr = librosa.load(mary_had_lamb)
|
||||
# Add small silence after each audio for repeatability in the split process
|
||||
@ -87,7 +104,8 @@ async def test_long_audio_request(mary_had_lamb):
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert out.count("Mary had a little lamb") == 10
|
||||
counts = out.count("Mary had a little lamb")
|
||||
assert counts == 10, counts
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -440,6 +440,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", is_available_online=False, tokenizer_mode="mistral"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
|
||||
# [Cross-encoder]
|
||||
@ -513,4 +514,4 @@ class HfExampleModels:
|
||||
raise ValueError(f"No example model defined for {model_id}")
|
||||
|
||||
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
|
||||
|
||||
@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
|
||||
from pydantic.dataclasses import dataclass
|
||||
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
|
||||
from torch.distributed import ProcessGroup, ReduceOp
|
||||
from typing_extensions import Self, deprecated, runtime_checkable
|
||||
from typing_extensions import Self, runtime_checkable
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import version
|
||||
@ -1730,35 +1730,6 @@ class CacheConfig:
|
||||
logger.warning("Possibly too large swap space. %s", msg)
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
class TokenizerPoolConfig:
|
||||
"""This config is deprecated and will be removed in a future release.
|
||||
|
||||
Passing these parameters will have no effect. Please remove them from your
|
||||
configurations.
|
||||
"""
|
||||
|
||||
pool_size: int = 0
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Passing this parameter will have no effect. Please remove it from your
|
||||
configurations."""
|
||||
pool_type: str = "ray"
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Passing this parameter will have no effect. Please remove it from your
|
||||
configurations."""
|
||||
extra_config: dict = field(default_factory=dict)
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Passing this parameter will have no effect. Please remove it from your
|
||||
configurations."""
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
logger.warning_once(
|
||||
"TokenizerPoolConfig is deprecated and will be removed in a "
|
||||
"future release. Passing this parameter will have no effect. "
|
||||
"Please remove it from your configurations.")
|
||||
|
||||
|
||||
class LoadFormat(str, enum.Enum):
|
||||
AUTO = "auto"
|
||||
PT = "pt"
|
||||
@ -1922,10 +1893,6 @@ class ParallelConfig:
|
||||
disable_custom_all_reduce: bool = False
|
||||
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
||||
|
||||
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Please remove it from your configs"""
|
||||
|
||||
ray_workers_use_nsight: bool = False
|
||||
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
|
||||
|
||||
@ -3692,18 +3659,6 @@ GuidedDecodingBackend = Literal[GuidedDecodingBackendV0,
|
||||
class DecodingConfig:
|
||||
"""Dataclass which contains the decoding strategy of the engine."""
|
||||
|
||||
@property
|
||||
@deprecated(
|
||||
"`guided_decoding_backend` is deprecated and has been renamed to "
|
||||
"`backend`. This will be removed in v0.10.0. Please use the "
|
||||
"`backend` argument instead.")
|
||||
def guided_decoding_backend(self) -> GuidedDecodingBackend:
|
||||
return self.backend
|
||||
|
||||
@guided_decoding_backend.setter
|
||||
def guided_decoding_backend(self, value: GuidedDecodingBackend):
|
||||
self.backend = value
|
||||
|
||||
backend: GuidedDecodingBackend = "auto" if envs.VLLM_USE_V1 else "xgrammar"
|
||||
"""Which engine will be used for guided decoding (JSON schema / regex etc)
|
||||
by default. With "auto", we will make opinionated choices based on request
|
||||
@ -3746,9 +3701,6 @@ class DecodingConfig:
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self):
|
||||
if ":" in self.backend:
|
||||
self._extract_backend_options()
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
valid_guided_backends = get_args(GuidedDecodingBackendV1)
|
||||
else:
|
||||
@ -3764,24 +3716,6 @@ class DecodingConfig:
|
||||
raise ValueError("disable_additional_properties is only supported "
|
||||
"for the guidance backend.")
|
||||
|
||||
@deprecated(
|
||||
"Passing guided decoding backend options inside backend in the format "
|
||||
"'backend:...' is deprecated. This will be removed in v0.10.0. Please "
|
||||
"use the dedicated arguments '--disable-fallback', "
|
||||
"'--disable-any-whitespace' and '--disable-additional-properties' "
|
||||
"instead.")
|
||||
def _extract_backend_options(self):
|
||||
"""Extract backend options from the backend string."""
|
||||
backend, options = self.backend.split(":")
|
||||
self.backend = cast(GuidedDecodingBackend, backend)
|
||||
options_set = set(options.strip().split(","))
|
||||
if "no-fallback" in options_set:
|
||||
self.disable_fallback = True
|
||||
if "disable-any-whitespace" in options_set:
|
||||
self.disable_any_whitespace = True
|
||||
if "no-additional-properties" in options_set:
|
||||
self.disable_additional_properties = True
|
||||
|
||||
|
||||
DetailedTraceModules = Literal["model", "worker", "all"]
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ from collections import defaultdict
|
||||
from collections.abc import Iterator
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from importlib import metadata
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
import msgspec
|
||||
@ -42,16 +43,19 @@ EngineId = str
|
||||
ReqId = str
|
||||
GET_META_MSG = b"get_meta_msg"
|
||||
|
||||
import os
|
||||
VLLM_DEBUG_NIXL_XFER_TIME = os.getenv("VLLM_DEBUG_NIXL_XFER_TIME", "1") == "1"
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
|
||||
try:
|
||||
from nixl._api import nixl_agent as NixlWrapper
|
||||
logger.info("NIXL is available")
|
||||
from nixl._api import nixl_agent as NixlWrapper, nixl_agent_config
|
||||
NIXL_VERSION = metadata.version("nixl")
|
||||
except ImportError:
|
||||
logger.warning("NIXL is not available")
|
||||
NixlWrapper = None
|
||||
|
||||
NIXL_VERSION = None
|
||||
|
||||
class NixlAgentMetadata(
|
||||
msgspec.Struct,
|
||||
@ -352,16 +356,20 @@ class NixlConnectorWorker:
|
||||
def __init__(self, vllm_config: VllmConfig, engine_id: str):
|
||||
if NixlWrapper is None:
|
||||
logger.error("NIXL is not available")
|
||||
raise RuntimeError("NIXL is not available")
|
||||
logger.info("Initializing NIXL wrapper")
|
||||
logger.info("Initializing NIXL worker %s", engine_id)
|
||||
raise RuntimeError("NIXL is not available.")
|
||||
logger.info("Initializing NIXL v%s: worker %s", NIXL_VERSION, engine_id)
|
||||
|
||||
# Config.
|
||||
self.vllm_config = vllm_config
|
||||
self.block_size = vllm_config.cache_config.block_size
|
||||
|
||||
# Agent.
|
||||
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
|
||||
import os
|
||||
NIXL_NUM_WORKERS = int(os.getenv("VLLM_NIXL_NUM_WORKERS", "8"))
|
||||
logger.info(f"Using NIXL_NUM_WORKERS={NIXL_NUM_WORKERS} for NIXL agent.")
|
||||
|
||||
config = nixl_agent_config(enable_prog_thread=False, num_threads=NIXL_NUM_WORKERS)
|
||||
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
|
||||
# Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
|
||||
self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
|
||||
|
||||
@ -449,7 +457,8 @@ class NixlConnectorWorker:
|
||||
|
||||
def __del__(self):
    """Cleanup background threads on destruction.

    Both attributes are looked up defensively: ``__del__`` is also invoked
    on an instance whose ``__init__`` raised before assigning them, and an
    ``AttributeError`` here would only add noise on top of the real error.
    The executor is shut down exactly once and without waiting; the
    listener thread is joined with a zero timeout so destruction never
    blocks.
    """
    if executor := getattr(self, "_handshake_initiation_executor", None):
        executor.shutdown(wait=False)
    if listener_t := getattr(self, "_nixl_handshake_listener_t", None):
        listener_t.join(timeout=0)
|
||||
|
||||
@ -1019,10 +1028,16 @@ class NixlConnectorWorker:
|
||||
remote_xfer_side_handle,
|
||||
remote_block_descs_ids,
|
||||
notif_msg=notif_id,
|
||||
skip_desc_merge=True,
|
||||
)
|
||||
|
||||
# Begin async xfer.
|
||||
start = time.perf_counter()
|
||||
self.nixl_wrapper.transfer(handle)
|
||||
end = time.perf_counter()
|
||||
if VLLM_DEBUG_NIXL_XFER_TIME:
|
||||
# Log the time taken for the transfer.
|
||||
logger.info(f"TIME: {end - start}")
|
||||
|
||||
# Use handle to check completion in future step().
|
||||
# TODO (NickLucche) surface xfer elapsed time
|
||||
|
||||
@ -9,7 +9,6 @@ import functools
|
||||
import json
|
||||
import sys
|
||||
import threading
|
||||
import warnings
|
||||
from dataclasses import MISSING, dataclass, fields, is_dataclass
|
||||
from itertools import permutations
|
||||
from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
|
||||
@ -19,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
|
||||
import regex as re
|
||||
import torch
|
||||
from pydantic import TypeAdapter, ValidationError
|
||||
from typing_extensions import TypeIs, deprecated
|
||||
from typing_extensions import TypeIs
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
@ -32,8 +31,8 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
ObservabilityConfig, ParallelConfig, PoolerConfig,
|
||||
PrefixCachingHashAlgo, PromptAdapterConfig,
|
||||
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
|
||||
TaskOption, TokenizerMode, TokenizerPoolConfig,
|
||||
VllmConfig, get_attr_docs, get_field)
|
||||
TaskOption, TokenizerMode, VllmConfig, get_attr_docs,
|
||||
get_field)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.plugins import load_general_plugins
|
||||
@ -66,9 +65,6 @@ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
|
||||
|
||||
def _parse_type(val: str) -> T:
|
||||
try:
|
||||
if return_type is json.loads and not re.match(
|
||||
r"(?s)^\s*{.*}\s*$", val):
|
||||
return cast(T, nullable_kvs(val))
|
||||
return return_type(val)
|
||||
except ValueError as e:
|
||||
raise argparse.ArgumentTypeError(
|
||||
@ -94,42 +90,6 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
|
||||
return optional_type(json.loads)(val)
|
||||
|
||||
|
||||
@deprecated(
    "Passing a JSON argument as a string containing comma separated key=value "
    "pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
    "string instead.")
def nullable_kvs(val: str) -> dict[str, int]:
    """Parse comma-separated ``KEY=VALUE`` pairs into a ``str -> int`` dict.

    Keys and values are lower-cased and stripped of surrounding whitespace
    before the value is converted with ``int``.

    Args:
        val: String value to be parsed, e.g. ``"image=2,video=1"``.

    Returns:
        Dictionary mapping each key to its integer value.

    Raises:
        argparse.ArgumentTypeError: If an item is not of the form
            ``KEY=VALUE``, if a value is not an integer, or if the same key
            appears twice with different values.
    """
    parsed: dict[str, int] = {}
    for entry in val.split(","):
        pieces = [piece.lower().strip() for piece in entry.split("=")]
        if len(pieces) != 2:
            raise argparse.ArgumentTypeError(
                "Each item should be in the form KEY=VALUE")
        name, raw = pieces

        try:
            number = int(raw)
        except ValueError as exc:
            raise argparse.ArgumentTypeError(
                f"Failed to parse value of item {name}={raw}") from exc

        # A repeated key is tolerated only when it repeats the same value;
        # setdefault stores the first value and hands back the stored one.
        if parsed.setdefault(name, number) != number:
            raise argparse.ArgumentTypeError(
                f"Conflicting values specified for key: {name}")

    return parsed
|
||||
|
||||
|
||||
def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
    """Check if the type hint is a specific type.

    Returns True when ``type_hint`` is ``type`` itself, or when its origin
    as reported by ``get_origin`` is ``type``.
    """
    if type_hint is type:
        return True
    return get_origin(type_hint) is type
|
||||
@ -373,13 +333,6 @@ class EngineArgs:
|
||||
enforce_eager: bool = ModelConfig.enforce_eager
|
||||
max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
|
||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||
# The following three fields are deprecated and will be removed in a future
|
||||
# release. Setting them will have no effect. Please remove them from your
|
||||
# configurations.
|
||||
tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
|
||||
tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
|
||||
tokenizer_pool_extra_config: dict = \
|
||||
get_field(TokenizerPoolConfig, "extra_config")
|
||||
limit_mm_per_prompt: dict[str, int] = \
|
||||
get_field(MultiModalConfig, "limit_per_prompt")
|
||||
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
|
||||
@ -441,7 +394,6 @@ class EngineArgs:
|
||||
|
||||
speculative_config: Optional[Dict[str, Any]] = None
|
||||
|
||||
qlora_adapter_name_or_path: Optional[str] = None
|
||||
show_hidden_metrics_for_version: Optional[str] = \
|
||||
ObservabilityConfig.show_hidden_metrics_for_version
|
||||
otlp_traces_endpoint: Optional[str] = \
|
||||
@ -475,7 +427,6 @@ class EngineArgs:
|
||||
|
||||
additional_config: dict[str, Any] = \
|
||||
get_field(VllmConfig, "additional_config")
|
||||
enable_reasoning: Optional[bool] = None # DEPRECATED
|
||||
reasoning_parser: str = DecodingConfig.reasoning_backend
|
||||
|
||||
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
|
||||
@ -493,13 +444,6 @@ class EngineArgs:
|
||||
if isinstance(self.compilation_config, (int, dict)):
|
||||
self.compilation_config = CompilationConfig.from_cli(
|
||||
str(self.compilation_config))
|
||||
if self.qlora_adapter_name_or_path is not None:
|
||||
warnings.warn(
|
||||
"The `qlora_adapter_name_or_path` is deprecated "
|
||||
"and will be removed in v0.10.0. ",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
# Setup plugins
|
||||
from vllm.plugins import load_general_plugins
|
||||
load_general_plugins()
|
||||
@ -612,14 +556,6 @@ class EngineArgs:
|
||||
**load_kwargs["ignore_patterns"])
|
||||
load_group.add_argument("--use-tqdm-on-load",
|
||||
**load_kwargs["use_tqdm_on_load"])
|
||||
load_group.add_argument(
|
||||
"--qlora-adapter-name-or-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The `--qlora-adapter-name-or-path` has no effect, do not set"
|
||||
" it, and it will be removed in v0.10.0.",
|
||||
deprecated=True,
|
||||
)
|
||||
load_group.add_argument('--pt-load-map-location',
|
||||
**load_kwargs["pt_load_map_location"])
|
||||
|
||||
@ -640,15 +576,6 @@ class EngineArgs:
|
||||
guided_decoding_group.add_argument(
|
||||
"--guided-decoding-disable-additional-properties",
|
||||
**guided_decoding_kwargs["disable_additional_properties"])
|
||||
guided_decoding_group.add_argument(
|
||||
"--enable-reasoning",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
deprecated=True,
|
||||
help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
|
||||
"of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
|
||||
"parser backend instead. This flag (`--enable-reasoning`) will be "
|
||||
"removed in v0.10.0. When `--reasoning-parser` is specified, "
|
||||
"reasoning mode is automatically enabled.")
|
||||
guided_decoding_group.add_argument(
|
||||
"--reasoning-parser",
|
||||
# These choices are a special case because they're not static
|
||||
@ -751,19 +678,6 @@ class EngineArgs:
|
||||
cache_group.add_argument("--calculate-kv-scales",
|
||||
**cache_kwargs["calculate_kv_scales"])
|
||||
|
||||
# Tokenizer arguments
|
||||
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
|
||||
tokenizer_group = parser.add_argument_group(
|
||||
title="TokenizerPoolConfig",
|
||||
description=TokenizerPoolConfig.__doc__,
|
||||
)
|
||||
tokenizer_group.add_argument("--tokenizer-pool-size",
|
||||
**tokenizer_kwargs["pool_size"])
|
||||
tokenizer_group.add_argument("--tokenizer-pool-type",
|
||||
**tokenizer_kwargs["pool_type"])
|
||||
tokenizer_group.add_argument("--tokenizer-pool-extra-config",
|
||||
**tokenizer_kwargs["extra_config"])
|
||||
|
||||
# Multimodal related configs
|
||||
multimodal_kwargs = get_kwargs(MultiModalConfig)
|
||||
multimodal_group = parser.add_argument_group(
|
||||
|
||||
@ -67,37 +67,6 @@ class ServeSubcommand(CLISubcommand):
|
||||
help="Start the vLLM OpenAI Compatible API server.",
|
||||
description="Start the vLLM OpenAI Compatible API server.",
|
||||
usage="vllm serve [model_tag] [options]")
|
||||
serve_parser.add_argument("model_tag",
|
||||
type=str,
|
||||
nargs='?',
|
||||
help="The model tag to serve "
|
||||
"(optional if specified in config)")
|
||||
serve_parser.add_argument(
|
||||
"--headless",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Run in headless mode. See multi-node data parallel "
|
||||
"documentation for more details.")
|
||||
serve_parser.add_argument(
|
||||
'--data-parallel-start-rank',
|
||||
'-dpr',
|
||||
type=int,
|
||||
default=0,
|
||||
help="Starting data parallel rank for secondary nodes. "
|
||||
"Requires --headless.")
|
||||
serve_parser.add_argument('--api-server-count',
|
||||
'-asc',
|
||||
type=int,
|
||||
default=1,
|
||||
help='How many API server processes to run.')
|
||||
serve_parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
default='',
|
||||
required=False,
|
||||
help="Read CLI options from a config file. "
|
||||
"Must be a YAML with the following options: "
|
||||
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
|
||||
|
||||
serve_parser = make_arg_parser(serve_parser)
|
||||
show_filtered_argument_or_group_from_help(serve_parser, ["serve"])
|
||||
|
||||
@ -1514,8 +1514,6 @@ async def init_app_state(
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
expand_tools_even_if_tool_choice_none=args.
|
||||
expand_tools_even_if_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
@ -1531,8 +1529,6 @@ async def init_app_state(
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
expand_tools_even_if_tool_choice_none=args.
|
||||
expand_tools_even_if_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
|
||||
@ -182,13 +182,6 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
|
||||
"""If set to True, enable tracking server_load_metrics in the app state."""
|
||||
enable_force_include_usage: bool = False
|
||||
"""If set to True, including usage on every request."""
|
||||
expand_tools_even_if_tool_choice_none: bool = False
|
||||
"""Include tool definitions in prompts even when `tool_choice='none'`.
|
||||
|
||||
This is a transitional option that will be removed in v0.10.0. In
|
||||
v0.10.0, tool definitions will always be included regardless of
|
||||
`tool_choice` setting. Use this flag to test the upcoming behavior
|
||||
before the breaking change."""
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
@ -225,11 +218,6 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
|
||||
valid_tool_parsers = list(ToolParserManager.tool_parsers.keys())
|
||||
frontend_kwargs["tool_call_parser"]["choices"] = valid_tool_parsers
|
||||
|
||||
# Special case for expand-tools-even-if-tool-choice-none because of
|
||||
# the deprecation field
|
||||
frontend_kwargs["expand_tools_even_if_tool_choice_none"]\
|
||||
["deprecated"] = True
|
||||
|
||||
frontend_group = parser.add_argument_group(
|
||||
title="Frontend",
|
||||
description=FrontendArgs.__doc__,
|
||||
@ -248,6 +236,34 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
register all arguments instead of manually enumerating them here. This
|
||||
avoids code duplication and keeps the argument definitions in one place.
|
||||
"""
|
||||
parser.add_argument("model_tag",
|
||||
type=str,
|
||||
nargs="?",
|
||||
help="The model tag to serve "
|
||||
"(optional if specified in config)")
|
||||
parser.add_argument(
|
||||
"--headless",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Run in headless mode. See multi-node data parallel "
|
||||
"documentation for more details.")
|
||||
parser.add_argument(
|
||||
"--data-parallel-start-rank",
|
||||
"-dpr",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Starting data parallel rank for secondary nodes. "
|
||||
"Requires --headless.")
|
||||
parser.add_argument("--api-server-count",
|
||||
"-asc",
|
||||
type=int,
|
||||
default=1,
|
||||
help="How many API server processes to run.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
help="Read CLI options from a config file. "
|
||||
"Must be a YAML with the following options: "
|
||||
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
|
||||
parser = FrontendArgs.add_cli_args(parser)
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
|
||||
|
||||
@ -63,7 +63,6 @@ class OpenAIServingChat(OpenAIServing):
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
reasoning_parser: str = "",
|
||||
enable_auto_tools: bool = False,
|
||||
expand_tools_even_if_tool_choice_none: bool = False,
|
||||
tool_parser: Optional[str] = None,
|
||||
enable_prompt_tokens_details: bool = False,
|
||||
enable_force_include_usage: bool = False,
|
||||
@ -112,8 +111,6 @@ class OpenAIServingChat(OpenAIServing):
|
||||
raise TypeError("Error: --enable-auto-tool-choice requires "
|
||||
f"tool_parser:'{tool_parser}' which has not "
|
||||
"been registered") from e
|
||||
self.expand_tools_even_if_tool_choice_none = (
|
||||
expand_tools_even_if_tool_choice_none)
|
||||
|
||||
self.enable_prompt_tokens_details = enable_prompt_tokens_details
|
||||
self.enable_force_include_usage = enable_force_include_usage
|
||||
@ -182,20 +179,6 @@ class OpenAIServingChat(OpenAIServing):
|
||||
|
||||
if request.tools is None:
|
||||
tool_dicts = None
|
||||
elif (request.tool_choice == "none"
|
||||
and not self.expand_tools_even_if_tool_choice_none):
|
||||
if len(request.tools) > 0:
|
||||
logger.warning_once(
|
||||
"Tools are specified but tool_choice is set to 'none' "
|
||||
"and --expand-tools-even-if-tool-choice-none is not "
|
||||
"enabled. Tool definitions will be excluded from the "
|
||||
"prompt. This behavior will change in vLLM v0.10 where "
|
||||
"tool definitions will be included by default even "
|
||||
"with tool_choice='none'. To adopt the new behavior "
|
||||
"now, use --expand-tools-even-if-tool-choice-none. "
|
||||
"To suppress this warning, either remove tools from "
|
||||
"the request or set tool_choice to a different value.")
|
||||
tool_dicts = None
|
||||
else:
|
||||
tool_dicts = [tool.model_dump() for tool in request.tools]
|
||||
|
||||
|
||||
@ -51,7 +51,6 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
reasoning_parser: str = "",
|
||||
enable_auto_tools: bool = False,
|
||||
expand_tools_even_if_tool_choice_none: bool = False,
|
||||
tool_parser: Optional[str] = None,
|
||||
enable_prompt_tokens_details: bool = False,
|
||||
enable_force_include_usage: bool = False,
|
||||
|
||||
@ -112,6 +112,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
prompt = self.model_cls.get_generation_prompt(
|
||||
audio=chunk,
|
||||
stt_config=self.asr_config,
|
||||
model_config=self.model_config,
|
||||
language=lang,
|
||||
task_type=self.task_type,
|
||||
request_prompt=request.prompt)
|
||||
|
||||
@ -573,8 +573,8 @@ class MambaMixer2(MambaBase, CustomOp):
|
||||
x = hidden_states_B_C_p.transpose(
|
||||
0, 1) # this is the form that causal-conv see
|
||||
if mamba2_metadata.cu_seqlen is None:
|
||||
mamba2_metadata = update_metadata(
|
||||
x, attn_metadata.query_start_loc, mamba2_metadata)
|
||||
mamba2_metadata = update_metadata(x, query_start_loc_p,
|
||||
mamba2_metadata)
|
||||
hidden_states_B_C_p = causal_conv1d_fn(
|
||||
x,
|
||||
conv_weights,
|
||||
@ -583,6 +583,7 @@ class MambaMixer2(MambaBase, CustomOp):
|
||||
conv_states=conv_state,
|
||||
has_initial_state=has_initial_states_p,
|
||||
cache_indices=state_indices_tensor_p,
|
||||
metadata=mamba2_metadata,
|
||||
query_start_loc=query_start_loc_p).transpose(
|
||||
0, 1)[:num_prefill_tokens]
|
||||
|
||||
@ -593,9 +594,14 @@ class MambaMixer2(MambaBase, CustomOp):
|
||||
initial_states = None
|
||||
if (has_initial_states_p is not None and prep_initial_states):
|
||||
# making a copy of the states
|
||||
initial_states = torch.where(
|
||||
has_initial_states_p[:, None, None, None],
|
||||
ssm_state[state_indices_tensor_p], 0)
|
||||
if envs.VLLM_USE_V1:
|
||||
initial_states = torch.where(
|
||||
has_initial_states_p[:, None, None, None],
|
||||
ssm_state[state_indices_tensor_p], 0)
|
||||
else:
|
||||
initial_states = torch.where(
|
||||
has_initial_states_p[:num_prefills, None, None, None],
|
||||
ssm_state[state_indices_tensor_p], 0)
|
||||
|
||||
scan_output, varlen_state = mamba_chunk_scan_combined(
|
||||
hidden_states_p.view(1, num_prefill_tokens,
|
||||
|
||||
@ -55,7 +55,6 @@ def _causal_conv1d_fwd_kernel( # continuous batching
|
||||
IS_CONTINUOUS_BATCHING: tl.constexpr,
|
||||
USE_PAD_SLOT: tl.constexpr,
|
||||
NP2_STATELEN: tl.constexpr,
|
||||
DECODE_SEQLEN: tl.constexpr,
|
||||
BLOCK_M: tl.constexpr,
|
||||
BLOCK_N: tl.constexpr,
|
||||
):
|
||||
@ -416,7 +415,7 @@ def causal_conv1d_fn(
|
||||
activation = "silu"
|
||||
|
||||
args = None
|
||||
out = torch.zeros_like(x)
|
||||
out = torch.empty_like(x)
|
||||
if metadata is not None:
|
||||
cu_seqlen = metadata.cu_seqlen
|
||||
nums_dict = metadata.nums_dict
|
||||
@ -607,7 +606,6 @@ def causal_conv1d_fn(
|
||||
IS_CONTINUOUS_BATCHING=cache_indices is not None,
|
||||
USE_PAD_SLOT=pad_slot_id is not None,
|
||||
NP2_STATELEN=np2_statelen,
|
||||
DECODE_SEQLEN=1,
|
||||
#launch_cooperative_grid=True
|
||||
BLOCK_M=8,
|
||||
BLOCK_N=256,
|
||||
@ -665,7 +663,8 @@ def _causal_conv1d_update_kernel(
|
||||
|
||||
if IS_CONTINUOUS_BATCHING:
|
||||
# mask = idx_seq < batch
|
||||
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq)
|
||||
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(
|
||||
tl.int64)
|
||||
else:
|
||||
conv_state_batch_coord = idx_seq
|
||||
if USE_PAD_SLOT: # noqa
|
||||
|
||||
@ -722,7 +722,8 @@ class SupportsTranscription(Protocol):
|
||||
|
||||
@classmethod
|
||||
def get_generation_prompt(cls, audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig, language: str,
|
||||
stt_config: SpeechToTextConfig,
|
||||
model_config: ModelConfig, language: str,
|
||||
task_type: str,
|
||||
request_prompt: str) -> PromptType:
|
||||
"""Get the prompt for the ASR model.
|
||||
|
||||
@ -231,6 +231,7 @@ _MULTIMODAL_MODELS = {
|
||||
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
|
||||
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
|
||||
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
|
||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
||||
# [Encoder-decoder]
|
||||
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
|
||||
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
|
||||
|
||||
691
vllm/model_executor/models/voxtral.py
Normal file
691
vllm/model_executor/models/voxtral.py
Normal file
@ -0,0 +1,691 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
from math import ceil
|
||||
from typing import Optional, Union, cast
|
||||
|
||||
import numpy as np
|
||||
import regex as re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from mistral_common.audio import mel_filter_bank
|
||||
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
|
||||
TextChunk, UserMessage)
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.protocol.transcription.request import TranscriptionRequest
|
||||
from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
|
||||
from transformers import TensorType, WhisperConfig
|
||||
from transformers.tokenization_utils_base import TextInput
|
||||
|
||||
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.models import SupportsPP
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.whisper import (
|
||||
WhisperEncoder, WhisperForConditionalGeneration)
|
||||
# yapf: enable
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalKwargs, NestedTensors)
|
||||
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
|
||||
MultiModalDataParser)
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
BaseProcessingInfo, MultiModalHashes,
|
||||
PromptReplacement, PromptUpdate)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.tokenizer import (MistralTokenizer,
|
||||
cached_tokenizer_from_config)
|
||||
|
||||
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
|
||||
SupportsTranscription)
|
||||
from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
|
||||
merge_multimodal_embeddings)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class VoxtralProcessorAdapter:
    """
    Provide a HF-compatible interface for
    :class:`mistral_common.tokens.tokenizers.audio.AudioEncoder`.

    The audio encoder is taken from the wrapped tokenizer
    (``tokenizer.instruct.audio_encoder``); every property below is a
    cached read of one of its attributes.
    """

    def __init__(self, tokenizer: MistralTokenizer) -> None:
        super().__init__()
        self.tokenizer = tokenizer

    @cached_property
    def _audio_processor(self) -> AudioEncoder:
        encoder = self.tokenizer.instruct.audio_encoder
        assert isinstance(encoder, AudioEncoder)
        return encoder

    @cached_property
    def audio_token_id(self) -> int:
        return self._audio_processor.special_ids.audio

    @cached_property
    def begin_audio_token_id(self) -> int:
        return self._audio_processor.special_ids.begin_audio

    @cached_property
    def sampling_rate(self) -> int:
        return self._audio_processor.audio_config.sampling_rate

    @cached_property
    def frame_rate(self) -> float:
        return self._audio_processor.audio_config.frame_rate

    def get_num_audio_tokens(
        self,
        audio_length: int,
    ) -> int:
        """Return how many audio placeholder tokens ``audio_length`` needs.

        The length is first passed through the encoder's
        ``next_multiple_of_chunk_frames``; the result is divided by
        ``sampling_rate // frame_rate`` and rounded up.
        """
        padded_length = self._audio_processor.next_multiple_of_chunk_frames(
            audio_length, self.sampling_rate)
        samples_per_token = self.sampling_rate // self.frame_rate
        return ceil(padded_length / samples_per_token)

    def _process_audio(
            self, audio: np.ndarray) -> tuple[torch.Tensor, torch.Tensor]:
        """Pad one 1-D waveform and build its placeholder token ids.

        Returns the token-id tensor (one begin-audio id followed by the
        audio ids) and the padded waveform as a tensor.
        """
        assert isinstance(audio, np.ndarray)
        assert audio.ndim == 1

        # pad if necessary
        padded = self._audio_processor.pad(audio, self.sampling_rate)

        token_ids = [self.begin_audio_token_id]
        token_ids += [self.audio_token_id
                      ] * self.get_num_audio_tokens(len(padded))
        return torch.tensor(token_ids), torch.tensor(padded)

    def __call__(
        self,
        text: Optional[Union[TextInput, list[TextInput]]] = None,
        audios: Optional[Union[np.ndarray, list[np.ndarray]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> Mapping[str, NestedTensors]:
        """Tokenize text, or turn audio arrays into placeholder tokens.

        Without audios, ``text`` is run through the tokenizer and only
        ``input_ids`` is returned. With audios, ``text`` may only contain
        empty strings; the result holds the concatenated audio placeholder
        ids (one row per text entry) plus the padded audio tensors under
        ``audio_arrays``. ``return_tensors`` and ``kwargs`` are accepted
        but unused.

        Raises:
            ValueError: If audios are given together with non-empty text.
        """
        if text is None:
            texts = []
        elif isinstance(text, list):
            texts = text
        else:
            texts = [text]

        if audios is None:
            audio_list = []
        elif isinstance(audios, list):
            audio_list = audios
        else:
            audio_list = [audios]

        if not audio_list:
            return {"input_ids": torch.tensor(self.tokenizer(texts).input_ids)}

        # Allow dummy text, which is used for profiling as well as token inputs
        if any(len(entry) > 0 for entry in texts):
            raise ValueError(
                "You've passed text inputs instead of token inputs. "
                "Make sure to process your input via `mistral_common`'s "
                "tokenizer or pass a chat completion request. "
                "For more info, see: "
                "https://github.com/vllm-project/vllm/issues/8411.")

        processed = [self._process_audio(audio) for audio in audio_list]
        token_tensors = [tokens for tokens, _ in processed]
        audio_tensors = [waveform for _, waveform in processed]

        input_ids = torch.cat(token_tensors)[None].expand(len(texts), -1)
        return {
            "input_ids": input_ids,
            "audio_arrays": audio_tensors,
        }
|
||||
|
||||
|
||||
class VoxtralProcessingInfo(BaseProcessingInfo):
    """Processing metadata for Voxtral: tokenizer, processor and limits."""

    def get_tokenizer(self) -> MistralTokenizer:
        """Return the cached tokenizer for this model.

        Raises:
            ValueError: If the configured tokenizer is not a
                ``MistralTokenizer``.
        """
        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
        if not isinstance(tokenizer, MistralTokenizer):
            raise ValueError("This model requires `--tokenizer-mode mistral`")

        return tokenizer

    def get_hf_processor(self, **kwargs: object) -> VoxtralProcessorAdapter:
        """Build a processor adapter around the model's tokenizer.

        ``VoxtralMultiModalProcessor._get_prompt_updates`` forwards the
        multimodal processor kwargs to this method, so they must be
        accepted. The adapter takes nothing but the tokenizer, hence they
        are deliberately ignored rather than raising ``TypeError``.
        """
        return VoxtralProcessorAdapter(self.get_tokenizer())

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": 5}  # Performance tends to degrade after 5

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        """Return the per-item token cap; ``seq_len``/``mm_counts`` unused."""
        return {"audio": self.get_max_audio_tokens()}

    def get_max_audio_tokens(self) -> int:
        """Upper bound on audio tokens: the model's ``max_model_len``."""
        return self.ctx.model_config.max_model_len

    def get_max_audio_array_len(self) -> int:
        """Longest audio array, in samples, matching the token cap.

        Computed as the token cap times ``sampling_rate // frame_rate``.
        """
        processor = self.get_hf_processor()
        return self.get_max_audio_tokens() * int(
            processor.sampling_rate // processor.frame_rate)
|
||||
|
||||
|
||||
class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
    """Builds the dummy (profiling) inputs for Voxtral."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Dummy prompts carry no text; only audio chunks are added."""
        return ""

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        """Return ``mm_counts["audio"]`` dummy audios of maximum length."""
        audio_count = mm_counts.get("audio", 0)
        max_len = self.info.get_max_audio_array_len()
        dummy_audios = self._get_dummy_audios(length=max_len,
                                              num_audios=audio_count)
        return {"audio": dummy_audios}

    def _to_audio_chunk(self, audio) -> AudioChunk:
        """Wrap one dummy audio array as a ``wav`` chat-message chunk."""
        wrapped = Audio(
            audio_array=audio,
            sampling_rate=self.info.get_hf_processor().sampling_rate,
            format="wav",
        )
        return AudioChunk(input_audio=RawAudio.from_audio(wrapped))

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """Encode a dummy chat request and return its tokens and audios.

        The dummy audios are packed into a single user message (after an
        empty text chunk) and encoded with the Mistral tokenizer. The
        returned multimodal data uses the audio arrays reported by the
        encoder rather than the ones originally generated.
        """
        tokenizer = self.info.get_tokenizer()

        text = self.get_dummy_text(mm_counts)
        mm_data = self.get_dummy_mm_data(seq_len, mm_counts)

        audio_chunks: list[AudioChunk] = [
            self._to_audio_chunk(audio) for audio in mm_data.get("audio", [])
        ]

        user_message = UserMessage(
            content=[TextChunk(text=text), *audio_chunks])
        request = ChatCompletionRequest(messages=[user_message])
        encoded = tokenizer.mistral.encode_chat_completion(request)

        # The tokenizer adds padding to the audio while encoding, so the
        # audio arrays are replaced by the ones it returns.
        mm_data["audio"] = [item.audio_array for item in encoded.audios]

        return ProcessorInputs(prompt=encoded.tokens, mm_data=mm_data)
|
||||
|
||||
|
||||
class VoxtralMultiModalProcessor(
        BaseMultiModalProcessor[VoxtralProcessingInfo]):
    """Multimodal processor for Voxtral audio inputs."""

    def _get_mm_fields_config(
        self,
        hf_inputs: Mapping[str, NestedTensors],
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Declare ``audio_arrays`` as a batched field of modality audio."""
        return {"audio_arrays": MultiModalFieldConfig.batched("audio")}

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        """Describe the audio placeholder run for each audio item.

        The replacement for item ``i`` is the audio token id repeated as
        many times as the processor reports for that item's length.
        """
        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        placeholder_id = processor.audio_token_id

        def audio_placeholder_tokens(item_idx: int):
            audio_items = mm_items.get_items("audio", AudioProcessorItems)
            length = audio_items.get_audio_length(item_idx)
            return [placeholder_id] * processor.get_num_audio_tokens(length)

        audio_update = PromptReplacement(
            modality="audio",
            target="",  # Never match the prompt (see below note)
            replacement=audio_placeholder_tokens,
        )
        return [audio_update]

    def _cached_apply_hf_processor(
        self,
        prompt: Union[str, list[int]],
        mm_data_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
        *,
        return_mm_hashes: bool,
    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
        """Delegate to the base class, forcing the final flag to ``True``.

        The fourth element returned by the base implementation is
        discarded and replaced.
        """
        base_result = super()._cached_apply_hf_processor(
            prompt=prompt,
            mm_data_items=mm_data_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
            tokenization_kwargs=tokenization_kwargs,
            return_mm_hashes=return_mm_hashes,
        )
        prompt_ids, mm_kwargs, mm_hashes, _ = base_result

        # NOTE: The tokens are already inserted by the chat template
        return prompt_ids, mm_kwargs, mm_hashes, True

    def _get_data_parser(self) -> MultiModalDataParser:
        """Return a data parser targeting the processor's sampling rate."""
        target_sr = self.info.get_hf_processor().sampling_rate
        return MultiModalDataParser(target_sr=target_sr)
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(VoxtralMultiModalProcessor,
                                        info=VoxtralProcessingInfo,
                                        dummy_inputs=VoxtralDummyInputsBuilder)
class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
                                      SupportsPP, SupportsTranscription):
    """Audio-conditioned language model.

    Combines three sub-modules built in ``__init__``: a language model, a
    ``VoxtralEncoderModel`` audio encoder, and an ``AudioLanguageAdapter``
    that projects encoder output to the language model's hidden size.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the language model, audio encoder and adapter.

        Args:
            vllm_config: Engine configuration; its ``model_config.hf_config``
                must expose ``text_config`` and ``audio_config``.
            prefix: Name prefix forwarded (via ``maybe_prefix``) to the
                sub-modules.
        """
        super().__init__()
        self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)

        config = vllm_config.model_config.hf_config
        self.config = config
        self.downsample_factor = self.config.audio_config.downsample_factor

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
        )
        self.whisper_encoder = VoxtralEncoderModel(
            vllm_config.with_hf_config(config.audio_config),
            prefix=maybe_prefix(prefix, "whisper_encoder"),
        )
        # The adapter input width is d_model * downsample_factor because
        # get_multimodal_embeddings() folds `downsample_factor` consecutive
        # encoder frames into a single row before projecting.
        self.audio_language_adapter = AudioLanguageAdapter(
            hidden_size=config.audio_config.d_model * self.downsample_factor,
            dim=config.text_config.hidden_size,
        )

    def get_language_model(self) -> torch.nn.Module:
        """Return the wrapped language model module."""
        return self.language_model

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the language model's inner ``model`` on the inputs.

        If ``intermediate_tensors`` is given, ``inputs_embeds`` is dropped.
        Otherwise, when no ``inputs_embeds`` is supplied, embeddings are built
        here from ``input_ids`` plus any audio found in ``kwargs`` and
        ``input_ids`` is then cleared.

        Returns:
            Whatever ``self.language_model.model`` returns.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            audio_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      audio_embeddings)
            input_ids = None

        hidden_states = self.language_model.model(input_ids,
                                                  positions,
                                                  intermediate_tensors,
                                                  inputs_embeds=inputs_embeds)

        return hidden_states

    def get_multimodal_embeddings(
        self, **kwargs
    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...],
               None]:
        """Encode the audio in ``kwargs`` into language-model-sized rows.

        Returns ``None`` when ``kwargs`` carries no ``audio_arrays``.
        Otherwise returns the result of ``torch.split``: one tensor per audio
        item, each produced by the adapter.
        """
        audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
        if audio_inputs is None:
            return None

        audio_embeddings = self.whisper_encoder(audio_inputs)

        # The list returned by the encoder is updated in place below.
        for i, audio_embedding in enumerate(audio_embeddings):
            seq_len, dim = audio_embedding.shape
            # Pad such that seq_len is divisible by downsample_factor
            target_seq_len = self.downsample_factor * math.ceil(
                seq_len / self.downsample_factor)
            # Pad spec (0, 0, 0, n): nothing on the last dim, n rows appended
            # at the end of the first dim.
            audio_embedding = torch.nn.functional.pad(
                audio_embedding,
                (0, 0, 0, target_seq_len - seq_len),
            )
            # Fold each group of `downsample_factor` consecutive rows into one
            # wider row.
            audio_embeddings[i] = audio_embedding.reshape(
                target_seq_len // self.downsample_factor,
                dim * self.downsample_factor)

        # Concat, project and resplit
        audio_embeddings_packed = torch.cat(audio_embeddings, dim=0)
        audio_embeddings_packed = self.audio_language_adapter(
            audio_embeddings_packed)
        # Split sizes are the per-item row counts from before concatenation.
        audio_embeddings = torch.split(audio_embeddings_packed,
                                       [a.shape[0] for a in audio_embeddings],
                                       dim=0)

        return audio_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        """Embed ``input_ids`` and merge in audio embeddings if provided.

        The placeholder token id passed to ``merge_multimodal_embeddings`` is
        read from the tokenizer's audio encoder (``audio_token``).
        """
        audio_encoder = self.tokenizer.instruct.audio_encoder
        audio_tok_id = audio_encoder.audio_token

        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        if multimodal_embeddings is not None:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings, audio_tok_id)
        return inputs_embeds

    def _parse_and_validate_audio_arrays(
            self, **kwargs: object) -> Union[list[torch.Tensor], None]:
        """Extract ``audio_arrays`` from ``kwargs``.

        Returns:
            ``None`` if the key is absent or ``None``; otherwise the value
            after ``flatten_bn``, with a tensor result unbound along dim 0
            into a list.

        Raises:
            ValueError: If the value is neither a tensor nor a list.
        """
        audio_arrays = kwargs.pop("audio_arrays", None)
        if audio_arrays is None:
            return None

        if not isinstance(audio_arrays, (torch.Tensor, list)):
            raise ValueError("Incorrect type of audio_arrays. "
                             f"Got type: {type(audio_arrays)}")

        audio_arrays = flatten_bn(audio_arrays)
        if isinstance(audio_arrays, torch.Tensor):
            audio_arrays = list(audio_arrays.unbind(0))
        return audio_arrays

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Delegate logits computation to the language model."""
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    @classmethod
    def get_speech_to_text_config(cls, model_config: ModelConfig,
                                  task_type: str) -> SpeechToTextConfig:
        """Build the speech-to-text config from the tokenizer's audio config.

        ``task_type`` is accepted but not used here.
        """
        tokenizer = cached_tokenizer_from_config(model_config)
        audio_config = tokenizer.instruct.audio_encoder.audio_config
        max_audio_clip_s = audio_config.chunk_length_s
        sample_rate = audio_config.sampling_rate
        return SpeechToTextConfig(
            max_audio_clip_s=max_audio_clip_s,
            sample_rate=sample_rate,
            # mistral_common and whisper encoder take care of chunking
            min_energy_split_window_size=None,
        )

    @classmethod
    # for speech-to-text transcription
    def get_generation_prompt(cls, audio: np.ndarray,
                              model_config: ModelConfig,
                              stt_config: SpeechToTextConfig, language: str,
                              task_type: str,
                              request_prompt: str) -> PromptType:
        """Build a tokenized transcription prompt for ``audio``.

        The waveform is wrapped in a ``TranscriptionRequest`` and encoded with
        the tokenizer's ``encode_transcription``. ``task_type`` and
        ``request_prompt`` are accepted but not used here.

        Returns:
            A dict with ``prompt_token_ids`` (the encoded tokens) and
            ``multi_modal_data["audio"]`` set to the first encoded audio
            array paired with ``stt_config.sample_rate``.
        """
        tokenizer = cached_tokenizer_from_config(model_config)
        audio = Audio(audio, int(stt_config.sample_rate),
                      format="wav")  # lossless
        req = TranscriptionRequest(model=model_config.model,
                                   audio=RawAudio.from_audio(audio),
                                   language=language)

        tokenized = tokenizer.instruct.encode_transcription(req)
        # Only the first encoded audio is used.
        audio = (tokenized.audios[0].audio_array, stt_config.sample_rate)
        prompts_dict = {"multi_modal_data": {"audio": audio}}
        prompts_dict["prompt_token_ids"] = tokenized.tokens
        return cast(PromptType, prompts_dict)

    @classmethod
    def validate_language(cls, language: str) -> bool:
        """Validate ``language`` using Whisper's implementation."""
        # same as whisper
        return WhisperForConditionalGeneration.validate_language(language)

    @classmethod
    def get_num_audio_tokens(cls, audio_duration_s: float,
                             stt_config: SpeechToTextConfig,
                             model_config: ModelConfig) -> Optional[int]:
        """
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
        This is used for estimating the amount of processing for this audio.
        """
        tokenizer = cached_tokenizer_from_config(model_config)
        adapter = VoxtralProcessorAdapter(tokenizer)
        # Duration is converted to a (truncated) sample count first.
        return adapter.get_num_audio_tokens(
            int(audio_duration_s * stt_config.sample_rate))

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load checkpoint weights into the encoder, adapter and LM.

        Each incoming ``(name, tensor)`` pair is routed to exactly one
        destination: the audio encoder, the adapter, or (by being yielded
        from the inner generator) the language model.

        Returns:
            The set of loaded parameter names, prefixed with
            ``whisper_encoder.`` / ``language_model.`` where applicable.
        """
        # The rules are tried in order and each is matched against the name
        # as rewritten by the previous rules, so they can cascade.
        # fmt: off
        remapping_rules = [
            (r"mm_whisper_embeddings\.(.*)", r"\1"),
            (r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
            (r"audio_language_adapter\.0\.weight", r"audio_language_adapter.w_in.weight"),  # noqa: E501
            (r"audio_language_adapter\.2\.weight", r"audio_language_adapter.w_out.weight"),  # noqa: E501
        ]
        # fmt: on

        # Wrapping the adapter in a ModuleDict makes the parameter names carry
        # the "audio_language_adapter." prefix used by the remapped names.
        audio_params = dict(
            nn.ModuleDict({
                "audio_language_adapter":
                self.audio_language_adapter,
            }).named_parameters())

        loaded_weights = set()

        def llm_weights_generator():
            # Consumed lazily by language_model.load_weights() below; encoder
            # and adapter weights are loaded as a side effect of iteration and
            # only the remaining weights are yielded.
            nonlocal loaded_weights
            for name, w in weights:
                # Decided on the ORIGINAL name, before any remapping.
                is_encoder = (
                    name.startswith("mm_whisper_embeddings") and
                    not name.startswith("mm_whisper_embeddings.tok_embeddings")
                    and not name.startswith(
                        "mm_whisper_embeddings.audio_language_projection"))

                for pattern, repl in remapping_rules:
                    if re.fullmatch(pattern, name):
                        name = re.sub(pattern, repl, name)

                if is_encoder:
                    # load_weight returns the final (remapped) parameter name.
                    name = self.whisper_encoder.load_weight((name, w))
                    loaded_weights.add(f"whisper_encoder.{name}")
                    continue

                if name in audio_params:
                    param = audio_params[name]
                    with torch.no_grad():
                        default_weight_loader(param, w)
                    loaded_weights.add(name)
                else:
                    yield (name, w)

        for name in self.language_model.load_weights(llm_weights_generator()):
            loaded_weights.add(f"language_model.{name}")

        # potentially manually add position embeddings
        sin_key = "whisper_encoder.whisper_encoder.embed_positions.weight"
        if sin_key not in loaded_weights:
            # make sure we don't hit an error here
            loaded_weights.add(sin_key)

        return loaded_weights
|
||||
|
||||
|
||||
class AudioLanguageAdapter(nn.Module):
    """Bias-free two-layer projection: linear -> GELU -> linear."""

    def __init__(self, hidden_size: int, dim: int) -> None:
        """Create the projection layers.

        Args:
            hidden_size: Width of the input features.
            dim: Width of both the intermediate and the output features.
        """
        super().__init__()
        # Attribute names and registration order are kept as-is: they define
        # the parameter names (w_in.weight, w_out.weight) seen by loaders.
        self.w_in = nn.Linear(in_features=hidden_size,
                              out_features=dim,
                              bias=False)
        self.gelu = nn.GELU()
        self.w_out = nn.Linear(in_features=dim, out_features=dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` from ``hidden_size`` features to ``dim`` features."""
        projected = self.w_in(x)
        activated = self.gelu(projected)
        return self.w_out(activated)
|
||||
|
||||
|
||||
class VoxtralEncoderModel(nn.Module):
    """Audio encoder wrapper: waveform -> log-mel features -> WhisperEncoder.

    Also knows how to rename Mistral-format encoder checkpoint keys to the
    parameter names used by the wrapped ``WhisperEncoder`` (see
    ``mistral_remapping`` and ``load_weight``).
    """

    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

    # (regex on checkpoint name, replacement) pairs applied by load_weight().
    # fmt: off
    mistral_remapping = [
        (r"whisper_encoder\.conv_layers\.0\.(weight|bias)", r"whisper_encoder.conv1.\1"),  # noqa: E501
        (r"whisper_encoder\.conv_layers\.1\.(weight|bias)", r"whisper_encoder.conv2.\1"),  # noqa: E501
        (r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn.\2_proj.\3"),  # noqa: E501
        (r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.wo\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn.out_proj.\2"),  # noqa: E501
        (r"whisper_encoder\.transformer\.layers\.(\d+)\.attention_norm\.(weight|bias)", r"whisper_encoder.layers.\1.self_attn_layer_norm.\2"),  # noqa: E501
        (r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w1\.(weight|bias)", r"whisper_encoder.layers.\1.mlp.fc1.\2"),  # noqa: E501
        (r"whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w2\.(weight|bias)", r"whisper_encoder.layers.\1.mlp.fc2.\2"),  # noqa: E501
        (r"whisper_encoder\.transformer\.layers\.(\d+)\.ffn_norm\.(weight|bias)", r"whisper_encoder.layers.\1.final_layer_norm.\2"),  # noqa: E501
        (r"whisper_encoder\.transformer\.norm\.(weight|bias)", r"whisper_encoder.layer_norm.\1"),  # noqa: E501
    ]
    # fmt: on

    def __init__(
        self,
        vllm_config: VllmConfig,
        *,
        prefix: str = "",
    ) -> None:
        """Build the wrapped encoder and precompute the mel filter bank.

        Args:
            vllm_config: Engine configuration whose ``model_config.hf_config``
                is treated as a ``WhisperConfig``.
            prefix: Name prefix forwarded (via ``maybe_prefix``) to the
                wrapped encoder.
        """
        super().__init__()
        self.config = cast(WhisperConfig, vllm_config.model_config.hf_config)
        self.dtype: torch.dtype = vllm_config.model_config.dtype
        self.whisper_encoder = WhisperEncoder(vllm_config=vllm_config,
                                              prefix=maybe_prefix(
                                                  prefix, "whisper_encoder"),
                                              is_standalone_encoder=True,
                                              init_in_fp32=True)
        mel_filters = mel_filter_bank(
            num_frequency_bins=1 + self.config.window_size // 2,
            num_mel_bins=self.config.num_mel_bins,
            min_frequency=0.0,
            max_frequency=8000.0,
            sampling_rate=self.config.sampling_rate,
        )
        # Stored as a plain tensor attribute (not a parameter or buffer), so
        # it does not appear in named_parameters() and is NOT moved by
        # Module.to(); compute_whisper_melspec() relocates it on use.
        self.mel_filters = torch.tensor(mel_filters, dtype=torch.float32)

    def compute_whisper_melspec(
        self,
        audio_waveforms: torch.Tensor,
    ) -> torch.Tensor:
        """Compute a normalised log-mel spectrogram for a waveform.

        Args:
            audio_waveforms: Waveform tensor accepted by ``torch.stft``.

        Returns:
            The log-mel spectrogram, cast back to the input's dtype.
        """
        input_dtype = audio_waveforms.dtype
        device = audio_waveforms.device
        window = torch.hann_window(self.config.window_size).to(device)
        stft = torch.stft(
            audio_waveforms,
            self.config.window_size,
            self.config.hop_length,
            window=window,
            return_complex=True,
        )
        # Power spectrum; the final STFT frame is dropped.
        magnitudes = stft[..., :-1].abs()**2
        # The filter bank is not a registered buffer, so bring it to the
        # input's device explicitly (no-op when it is already there). Without
        # this the matmul fails when the waveform lives on another device.
        mel_filters = self.mel_filters.to(device)
        mel_spec = mel_filters.T @ magnitudes
        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
        # Floor everything more than 8 (log10 units) below the peak.
        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
        log_spec = (log_spec + 4.0) / 4.0
        return log_spec.to(input_dtype)

    @property
    def downsample_factor(self) -> int:
        """Product of the strides of the encoder's two conv layers."""
        return self.whisper_encoder.conv1.stride[
            0] * self.whisper_encoder.conv2.stride[0]

    @property
    def chunk_size(self) -> int:
        """Number of mel frames per chunk fed to the encoder."""
        return self.config.max_source_positions * self.downsample_factor

    def prepare_inputs_for_conv(
        self,
        audio_waveforms: list[torch.Tensor],
    ) -> tuple[torch.Tensor, list[int]]:
        """Convert waveforms to stacked, fixed-size mel-spectrogram chunks.

        Args:
            audio_waveforms: One waveform tensor per audio item.

        Returns:
            A tuple ``(chunks, chunks_per_example)``: all chunks stacked along
            a new leading dim, and the number of chunks each input produced.
        """
        assert isinstance(audio_waveforms, list)
        # list[num_mel_bins, seq_len]
        input_features = [
            self.compute_whisper_melspec(audio).to(self.dtype)
            for audio in audio_waveforms
        ]

        chunked_features: list[torch.Tensor] = []
        chunks_per_example: list[int] = []
        for feature in input_features:
            chunks = feature.split(self.chunk_size, dim=-1)
            chunked_features += chunks
            chunks_per_example.append(len(chunks))

        # NOTE(review): torch.stack needs every chunk to have the same shape,
        # so inputs are presumably pre-padded to a multiple of chunk_size
        # frames -- confirm against the caller.
        # [total_num_chunks, num_mel_bins, chunk_size]
        return torch.stack(chunked_features), chunks_per_example

    def forward(
        self, input_features: Union[torch.Tensor, list[torch.Tensor]]
    ) -> list[torch.Tensor]:
        """Encode one or more waveforms.

        Args:
            input_features: A single waveform tensor or a list of them.

        Returns:
            One tensor per input, formed by concatenating that input's
            encoded chunks along the sequence dimension.
        """
        if not isinstance(input_features, list):
            input_features = [input_features]

        # Split long inputs into chunks
        input_embeds, chunks_per_example = (
            self.prepare_inputs_for_conv(input_features))

        # [total_num_chunks, ceil(chunk_size / downsample_factor), hidden_size]
        out = self.whisper_encoder([input_embeds])

        # Re-concatenate the chunks
        chunk_idx = 0
        results = []
        for n_chunks in chunks_per_example:
            # Merge (n_chunks, frames) into a single sequence dimension.
            result = out[chunk_idx:chunk_idx + n_chunks].flatten(0, 1)
            results.append(result)
            chunk_idx += n_chunks

        return results

    def load_weight(self, weight: tuple[str, torch.Tensor]) -> str:
        """Load a single checkpoint tensor into this module.

        Args:
            weight: ``(checkpoint_name, tensor)`` pair.

        Returns:
            The parameter name the tensor was loaded into, after Mistral key
            remapping and q/k/v -> qkv_proj substitution.

        Raises:
            KeyError: If the resolved name is not a parameter of this module.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]
        params_dict = dict(self.named_parameters())

        name, loaded_weight = weight
        for pattern, repl in self.mistral_remapping:
            if re.fullmatch(pattern, name):
                name = re.sub(pattern, repl, name)

        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)

            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Not a q/k/v shard: load directly, preferring the parameter's
            # own loader when it defines one.
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)

        return name
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from contextlib import nullcontext
|
||||
from typing import Optional, TypedDict, Union, cast
|
||||
|
||||
import numpy as np
|
||||
@ -13,6 +14,7 @@ from transformers import (BatchFeature, WhisperConfig, WhisperFeatureExtractor,
|
||||
from transformers.models.whisper.modeling_whisper import sinusoids
|
||||
|
||||
from vllm.attention import Attention, AttentionType
|
||||
from vllm.attention.layer import MultiHeadAttention
|
||||
from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig,
|
||||
VllmConfig)
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
@ -26,6 +28,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
|
||||
@ -178,6 +181,7 @@ class WhisperAttention(nn.Module):
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
standalone_encoder: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.embed_dim = embed_dim
|
||||
@ -213,16 +217,24 @@ class WhisperAttention(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.out_proj",
|
||||
)
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
self.scaling,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
attn_type=self.attn_type,
|
||||
)
|
||||
if standalone_encoder:
|
||||
self.attn = MultiHeadAttention(
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
self.scaling,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
)
|
||||
else:
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
self.scaling,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
attn_type=self.attn_type,
|
||||
)
|
||||
|
||||
def _init_qkv(
|
||||
self,
|
||||
@ -357,7 +369,11 @@ class WhisperMLP(nn.Module):
|
||||
|
||||
class WhisperEncoderLayer(nn.Module):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
def __init__(self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
is_standalone_encoder: bool = False):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
cache_config = vllm_config.cache_config
|
||||
@ -371,6 +387,7 @@ class WhisperEncoderLayer(nn.Module):
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
standalone_encoder=is_standalone_encoder,
|
||||
)
|
||||
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
self.mlp = WhisperMLP(
|
||||
@ -462,10 +479,16 @@ class WhisperDecoderLayer(nn.Module):
|
||||
|
||||
class WhisperEncoder(nn.Module):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
def __init__(self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
is_standalone_encoder: bool = False,
|
||||
init_in_fp32: bool = False):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
embed_dim = config.d_model
|
||||
self.is_standalone_encoder = is_standalone_encoder
|
||||
self.num_mel_bins = config.num_mel_bins
|
||||
self.max_source_positions = config.max_source_positions
|
||||
self.embed_scale = (math.sqrt(embed_dim)
|
||||
@ -480,17 +503,25 @@ class WhisperEncoder(nn.Module):
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1)
|
||||
self.embed_positions = nn.Embedding(self.max_source_positions,
|
||||
embed_dim)
|
||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||
config.encoder_layers,
|
||||
lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config,
|
||||
prefix=f"{prefix}.layers"),
|
||||
prefix=f"{prefix}.layers",
|
||||
is_standalone_encoder=
|
||||
is_standalone_encoder),
|
||||
prefix=f"{prefix}.layers",
|
||||
)
|
||||
self.layer_norm = nn.LayerNorm(config.d_model)
|
||||
|
||||
with torch.no_grad():
|
||||
maybe_fp32_init_ctx = set_default_torch_dtype(
|
||||
torch.float32) if init_in_fp32 else nullcontext()
|
||||
|
||||
with (
|
||||
torch.no_grad(),
|
||||
maybe_fp32_init_ctx,
|
||||
):
|
||||
self.embed_positions = nn.Embedding(self.max_source_positions,
|
||||
embed_dim)
|
||||
self.embed_positions.weight.copy_(
|
||||
sinusoids(*self.embed_positions.weight.shape))
|
||||
|
||||
@ -499,8 +530,10 @@ class WhisperEncoder(nn.Module):
|
||||
for features in input_features:
|
||||
embeds = nn.functional.gelu(self.conv1(features))
|
||||
embeds = nn.functional.gelu(self.conv2(embeds))
|
||||
embeds = embeds.permute(1, 0)
|
||||
embeds = embeds + self.embed_positions.weight[:embeds.size(0), :]
|
||||
embeds = embeds.transpose(-1, -2)
|
||||
embeds = (embeds +
|
||||
self.embed_positions.weight[:embeds.size(-2), :]).to(
|
||||
embeds.dtype)
|
||||
hidden_states.append(embeds)
|
||||
hidden_states = torch.cat(hidden_states)
|
||||
|
||||
@ -792,10 +825,14 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
|
||||
f"or {list(ISO639_1_OTHER_LANGS.values())}")
|
||||
|
||||
@classmethod
|
||||
def get_generation_prompt(cls, audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig, language: str,
|
||||
task_type: str,
|
||||
request_prompt: str) -> PromptType:
|
||||
def get_generation_prompt(
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
model_config: ModelConfig, # not needed here
|
||||
stt_config: SpeechToTextConfig,
|
||||
language: str,
|
||||
task_type: str,
|
||||
request_prompt: str) -> PromptType:
|
||||
prompt = {
|
||||
"encoder_prompt": {
|
||||
# Whisper does not support encoder prompt.
|
||||
|
||||
@ -9,7 +9,6 @@ from typing import Annotated, Any, Optional, Union
|
||||
|
||||
import msgspec
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logits_process import LogitsProcessor
|
||||
@ -84,27 +83,6 @@ class GuidedDecodingParams:
|
||||
"You can only use one kind of guided decoding but multiple are "
|
||||
f"specified: {self.__dict__}")
|
||||
|
||||
if self.backend is not None and ":" in self.backend:
|
||||
self._extract_backend_options()
|
||||
|
||||
@deprecated(
|
||||
"Passing guided decoding backend options inside backend in the format "
|
||||
"'backend:...' is deprecated. This will be removed in v0.10.0. Please "
|
||||
"use the dedicated arguments '--disable-fallback', "
|
||||
"'--disable-any-whitespace' and '--disable-additional-properties' "
|
||||
"instead.")
|
||||
def _extract_backend_options(self):
|
||||
"""Extract backend options from the backend string."""
|
||||
assert isinstance(self.backend, str)
|
||||
self.backend, options = self.backend.split(":")
|
||||
options_set = set(options.strip().split(","))
|
||||
if "no-fallback" in options_set:
|
||||
self.disable_fallback = True
|
||||
if "disable-any-whitespace" in options_set:
|
||||
self.disable_any_whitespace = True
|
||||
if "no-additional-properties" in options_set:
|
||||
self.disable_additional_properties = True
|
||||
|
||||
|
||||
class RequestOutputKind(Enum):
|
||||
# Return entire output so far in every RequestOutput
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers import PretrainedConfig, WhisperConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -24,9 +24,21 @@ def adapt_config_dict(config_dict: dict[str, Any],
|
||||
|
||||
if bool(config_dict.get("yarn")):
|
||||
config_dict = _remap_mistral_yarn_args(config_dict)
|
||||
if bool((config_dict.get("multimodal") or {}).get("vision_encoder_args")
|
||||
or config_dict.get("vision_encoder")):
|
||||
|
||||
is_vision = ((config_dict.get("multimodal")
|
||||
or {}).get("vision_encoder_args")
|
||||
or config_dict.get("vision_encoder"))
|
||||
is_audio = bool(
|
||||
((config_dict.get("multimodal") or {}).get("whisper_model_args")
|
||||
or {}).get("encoder_args"))
|
||||
|
||||
assert not (is_vision and is_audio), \
|
||||
"Vision and audio are mutually exclusive"
|
||||
|
||||
if is_vision:
|
||||
config_dict = _remap_mistral_vision_args(config_dict)
|
||||
if is_audio:
|
||||
config_dict = _remap_mistral_audio_args(config_dict)
|
||||
|
||||
config = PretrainedConfig.from_dict(config_dict)
|
||||
|
||||
@ -118,3 +130,35 @@ def _remap_mistral_quantization_args(config: dict) -> dict:
|
||||
config["quantization_config"] = quantization_config
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_audio_args(config: dict) -> dict:
    """Build a Voxtral-style config dict from a Mistral audio config.

    Note that ``config`` is modified: ``whisper_model_args`` is popped from
    ``config["multimodal"]`` before the remainder is turned into the text
    config.

    Returns:
        A new dict with ``model_type``, ``architectures``, ``text_config``,
        ``audio_config`` and, when the input carries a truthy one,
        ``quantization_config``.
    """
    whisper_args = config["multimodal"].pop("whisper_model_args")
    encoder_args = whisper_args["encoder_args"]
    downsample_args = whisper_args["downsample_args"]
    quant_config = config.get("quantization_config")

    text_config = PretrainedConfig.from_dict(config)

    audio_encoding_args = encoder_args["audio_encoding_args"]
    audio_config = WhisperConfig(
        num_mel_bins=audio_encoding_args["num_mel_bins"],
        window_size=audio_encoding_args["window_size"],
        sampling_rate=audio_encoding_args["sampling_rate"],
        hop_length=audio_encoding_args["hop_length"],
        downsample_factor=downsample_args["downsample_factor"],
        d_model=encoder_args["dim"],
        encoder_layers=encoder_args["n_layers"],
        encoder_ffn_dim=encoder_args["hidden_dim"],
        encoder_attention_heads=encoder_args["n_heads"],
        vocab_size=encoder_args["vocab_size"],
        max_source_positions=encoder_args["max_source_positions"],
    )

    remapped = {
        "model_type": "whixtral",
        "architectures": ["VoxtralForConditionalGeneration"],
        "text_config": text_config,
        "audio_config": audio_config,
    }
    if quant_config:
        remapped["quantization_config"] = quant_config
    return remapped
|
||||
|
||||
@ -78,7 +78,12 @@ class KVCacheManager:
|
||||
) -> None:
|
||||
self.max_model_len = max_model_len
|
||||
|
||||
if len(kv_cache_config.kv_cache_groups) == 0:
|
||||
# Attention free models don't have kv cache,
|
||||
# thus don't need prefix caching.
|
||||
enable_caching = False
|
||||
self.enable_caching = enable_caching
|
||||
|
||||
self.caching_hash_fn = (
|
||||
sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
|
||||
sha256 if caching_hash_algo == "sha256" else hash)
|
||||
@ -101,7 +106,7 @@ class KVCacheManager:
|
||||
kv_cache_config=kv_cache_config,
|
||||
max_model_len=self.max_model_len,
|
||||
use_eagle=self.use_eagle,
|
||||
enable_caching=enable_caching,
|
||||
enable_caching=self.enable_caching,
|
||||
caching_hash_fn=self.caching_hash_fn,
|
||||
enable_kv_cache_events=enable_kv_cache_events,
|
||||
)
|
||||
|
||||
@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
|
||||
ValueError: If there is not enough memory available for the KV cache.
|
||||
"""
|
||||
|
||||
# No need to check for available memory if the kv_cache_spec is empty
|
||||
if not kv_cache_spec:
|
||||
return
|
||||
|
||||
if available_memory <= 0:
|
||||
raise ValueError("No available memory for the cache blocks. "
|
||||
"Try increasing `gpu_memory_utilization` when "
|
||||
@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
|
||||
return len(page_sizes) == 1
|
||||
|
||||
|
||||
def is_kv_cache_type_attention_free(
        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
    """Return True when ``kv_cache_spec`` is empty.

    An empty spec dict is how attention-free models are represented.
    """
    has_kv_cache_layers = bool(kv_cache_spec)
    return not has_kv_cache_layers
|
||||
|
||||
|
||||
def _get_kv_cache_config_uniform_page_size(
|
||||
vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
|
||||
return kv_cache_config
|
||||
|
||||
|
||||
def _get_kv_cache_config_attention_free() -> KVCacheConfig:
    """Return a KV cache config with one block and no tensors or groups."""
    return KVCacheConfig(
        num_blocks=1,
        kv_cache_tensors=[],
        kv_cache_groups=[],
    )
|
||||
|
||||
|
||||
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
|
||||
"""
|
||||
This function tries to convert the KV cache specs to one type if the model
|
||||
@ -957,7 +972,11 @@ def get_kv_cache_config(
|
||||
if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
|
||||
unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
|
||||
if is_kv_cache_type_uniform(kv_cache_spec):
|
||||
if is_kv_cache_type_attention_free(kv_cache_spec):
|
||||
# This returns a kv_cache config with 0 kv_cache groups and 1 block
|
||||
# to allow for the KVCache manager to handle attention free models.
|
||||
return _get_kv_cache_config_attention_free()
|
||||
elif is_kv_cache_type_uniform(kv_cache_spec):
|
||||
# KV cache of all layers are the same, which is true for
|
||||
# most models. Allocate the same amount of memory for
|
||||
# each layer.
|
||||
|
||||
@ -139,7 +139,13 @@ class EngineCore:
|
||||
|
||||
# Profiles the peak memory usage of the model to determine how much
|
||||
# memory can be allocated for kv cache.
|
||||
available_gpu_memory = self.model_executor.determine_available_memory()
|
||||
has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
|
||||
if has_kv_cache:
|
||||
available_gpu_memory = \
|
||||
self.model_executor.determine_available_memory()
|
||||
else:
|
||||
# Attention free models don't need memory for kv cache
|
||||
available_gpu_memory = [0] * len(kv_cache_specs)
|
||||
|
||||
assert len(kv_cache_specs) == len(available_gpu_memory)
|
||||
# Get the kv cache tensor size
|
||||
|
||||
Reference in New Issue
Block a user