From 61fed92c7e646d6f2ec5d9de54568a860870e6a4 Mon Sep 17 00:00:00 2001 From: ZincCat <52513999+zinccat@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:02:34 -0800 Subject: [PATCH 001/309] [Bugfix] Fix ColumnParallelLinearWithLoRA slice (#11708) Signed-off-by: ZincCat --- vllm/lora/layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 85164c2165..102e40d3f4 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -479,7 +479,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): # ColumnParallelLinear. else: tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_dim + shard_size = self.output_size start_idx = tensor_model_parallel_rank * shard_size end_idx = (tensor_model_parallel_rank + 1) * shard_size lora_b = lora_b[:, start_idx:end_idx] @@ -490,7 +490,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): if bias is None: return bias tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_dim + shard_size = self.output_size start_idx = tensor_model_parallel_rank * shard_size end_idx = (tensor_model_parallel_rank + 1) * shard_size bias = bias[start_idx:end_idx] From 1543914c04697fb252e4468b7c9d14be512b050a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:29:11 -0500 Subject: [PATCH 002/309] [V1] Improve TP>1 Error Handling + Stack Trace (#11721) Co-authored-by: Tyler Michael Smith --- vllm/v1/engine/async_llm.py | 16 ---------------- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 19 ++++++++++++++++++- vllm/v1/executor/multiproc_executor.py | 24 +++++++++++++++++++++--- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ff7a0c28dd..564d8a8343 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,5 @@ import asyncio import os -import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -42,21 +41,6 @@ class AsyncLLM(EngineClient): start_engine_loop: bool = True, ) -> None: - # The child processes will send SIGQUIT when unrecoverable - # errors happen. We kill the process tree here so that the - # stack trace is very evident. - # TODO: rather than killing the main process, we should - # figure out how to raise an AsyncEngineDeadError and - # handle at the API server level so we can return a better - # error code to the clients calling VLLM. - def sigquit_handler(signum, frame): - logger.fatal( - "AsyncLLM got SIGQUIT from worker processes, shutting " - "down. 
See stack trace above for root cause issue.") - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGQUIT, sigquit_handler) - assert start_engine_loop self.log_requests = log_requests diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 13a50a4f85..975ce11fe8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,7 +198,7 @@ class EngineCoreProc(EngineCore): except Exception: traceback = get_exception_traceback() logger.error("EngineCore hit an exception: %s", traceback) - parent_process.send_signal(signal.SIGQUIT) + parent_process.send_signal(signal.SIGUSR1) finally: if engine_core is not None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e009f3448b..6a40c961fc 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,5 @@ +import os +import signal import weakref from abc import ABC, abstractmethod from typing import List, Type @@ -8,7 +10,8 @@ import zmq.asyncio from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket +from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, + make_zmq_socket) from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) @@ -134,6 +137,20 @@ class MPClient(EngineCoreClient): executor_class: Type[Executor], log_stats: bool = False, ): + # The child processes will send SIGUSR1 when unrecoverable + # errors happen. We kill the process tree here so that the + # stack trace is very evident. + # TODO(rob): rather than killing the main process, we should + # figure out how to raise an AsyncEngineDeadError and + # handle at the API server level so we can return a better + # error code to the clients calling VLLM. + def sigusr1_handler(signum, frame): + logger.fatal("Got fatal signal from worker processes, shutting " + "down. See stack trace above for root cause issue.") + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGUSR1, sigusr1_handler) + # Serialization setup. self.encoder = PickleEncoder() self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ed64e77413..114deae980 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -9,6 +9,7 @@ from enum import Enum, auto from multiprocessing.process import BaseProcess from typing import Any, Dict, List, Optional, Tuple +import psutil import zmq from vllm.config import VllmConfig @@ -38,6 +39,19 @@ class MultiprocExecutor(Executor): # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) + # The child processes will send SIGUSR1 when unrecoverable + # errors happen. + def sigusr1_handler(signum, frame): + logger.fatal( + "MulitprocExecutor got fatal signal from worker processes, " + "shutting down. See stack trace above for root cause issue.") + # Propagate error up to parent process. 
+ parent_process = psutil.Process().parent() + parent_process.send_signal(signal.SIGUSR1) + self.shutdown() + + signal.signal(signal.SIGUSR1, sigusr1_handler) + self.vllm_config = vllm_config self.parallel_config = vllm_config.parallel_config @@ -335,8 +349,11 @@ class WorkerProc: except SystemExit: logger.debug("Worker interrupted.") - except BaseException as e: - logger.exception(e) + except Exception: + # worker_busy_loop sends exceptions exceptons to Executor + # for shutdown, but if there is an error in startup or an + # error with IPC itself, we need to alert the parent. + psutil.Process().parent().send_signal(signal.SIGUSR1) raise finally: @@ -377,9 +394,10 @@ class WorkerProc: try: output = getattr(self.worker, method)(*args, **kwargs) - except BaseException as e: + except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) + logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue self.worker_response_mq.enqueue( From a655eb30252fe266ce16fde2aa9f8f9554ccd46e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 4 Jan 2025 06:19:02 +0800 Subject: [PATCH 003/309] [Misc]Add BNB quantization for Qwen2VL (#11719) Signed-off-by: Jee Jee Li Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/qwen2_vl.py | 69 +++++++++++++++----------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 26b6d768ad..5a8c6e4deb 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -38,7 +38,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.distributed import parallel_state +from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata @@ -239,6 +239,8 @@ class Qwen2VisionAttention(nn.Module): super().__init__() # Per attention head and per partition values. 
world_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_size = world_size + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( @@ -261,24 +263,41 @@ class Qwen2VisionAttention(nn.Module): raise RuntimeError( f"Qwen2-VL does not support {self.attn_backend} backend now.") + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = tensor_model_parallel_all_gather(qkv) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] + + # [s, b, c] --> [s, b, 3 * head * head_dim] x, _ = self.qkv(x) - # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] - new_x_shape = x.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - x = x.view(*new_x_shape) - - # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] - q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() @@ -614,24 +633,6 @@ class Qwen2VisionTransformer(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - if name.endswith("qkv.weight"): - visual_num_heads = self.num_heads - visual_embed_dim = self.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size, - visual_embed_dim) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) - elif name.endswith("qkv.bias"): - visual_num_heads = self.num_heads - visual_embed_dim = self.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1) - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -935,6 +936,16 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, embedding_modules = {} embedding_padding_modules = [] + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ "lm_head.": "language_model.lm_head.", From bf0d97d78619b290ed273199ad3800b57b638603 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 3 Jan 2025 17:36:46 -0500 Subject: [PATCH 004/309] Update requirements-tpu.txt to support python 3.9 and 3.11 (#11695) Signed-off-by: mgoin --- requirements-tpu.txt | 4 +++- vllm/worker/tpu_model_runner.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index b8f0b15469..8ab18b3770 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -18,6 +18,8 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.6.0.dev20241126+cpu torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" jaxlib==0.4.36.dev20241122 jax==0.4.36.dev20241122 diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 9a054eb8a4..7bdb7f0e2d 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -126,8 +126,10 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): logger.warning( "The max_model_len (%d) is too large. This may degrade the " "performance due to the insufficient smem size. Consider " - "setting --max-model-len to a smaller value.", - self.model_config.max_model_len) + "setting --max-model-len to a smaller value, like %d.", + self.model_config.max_model_len, + self.model_config.max_model_len / + (block_table_size / smem_size)) def load_model(self) -> None: self.device = self.device_config.device From ad0d567e1cdc77aff435b20bac918bfd0f55db0a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 3 Jan 2025 18:25:02 -0500 Subject: [PATCH 005/309] [V1] Chore: cruft removal (#11724) --- vllm/entrypoints/llm.py | 2 -- vllm/v1/engine/core_client.py | 2 -- vllm/v1/engine/llm_engine.py | 4 ---- vllm/v1/engine/processor.py | 3 --- 4 files changed, 11 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 7c0de3b3e5..e48fd1a4fa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -225,8 +225,6 @@ class LLM: # Logic to switch between engines is done at runtime instead of import # to avoid import order issues self.engine_class = self.get_engine_class() - - # TODO(rob): enable mp by default (issue with fork vs spawn) self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 6a40c961fc..a4a45ae05f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -94,8 +94,6 @@ class InprocClient(EngineCoreClient): * pushes EngineCoreRequest directly into the EngineCore * pulls EngineCoreOutputs by stepping the EngineCore - - TODO: support asyncio-mode for debugging. 
""" def __init__(self, *args, **kwargs): diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1f49de67d7..0bd9b52c9b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,8 +42,6 @@ class LLMEngine: use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - - # TODO: Can we avoid this? self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -179,8 +177,6 @@ class LLMEngine: return request_outputs - # TODO(rob): Can we get rid of these? - def get_model_config(self): return self.model_config diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 905d3d1fc3..c0f6cfab48 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -49,9 +49,6 @@ class Processor: cache_config.enable_prefix_caching self.mm_hasher = MMHasher() - # TODO: run in an ThreadpoolExecutor or BackgroundProcess. - # This ideally should releases the GIL, so we should not block the - # asyncio loop while this is running. def process_inputs( self, request_id: str, From e5d7ed0c5374d38e75a8ef0243cc348f0f6f9185 Mon Sep 17 00:00:00 2001 From: WangErXiao <863579016@qq.com> Date: Sat, 4 Jan 2025 08:13:12 +0800 Subject: [PATCH 006/309] [V1] log GPU blocks num for MultiprocExecutor (#11656) --- vllm/v1/executor/multiproc_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 114deae980..41e6abbd67 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -95,6 +95,7 @@ class MultiprocExecutor(Executor): Initialize the KV caches and begin the model execution loop of the underlying workers. """ + logger.info("# GPU blocks: %d", num_gpu_blocks) self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) self.collective_rpc("compile_or_warm_up_model") From 9c93636d84414591ae4d7b9c1174af7e91052fd8 Mon Sep 17 00:00:00 2001 From: Hust_YangXian Date: Sat, 4 Jan 2025 14:16:30 +0800 Subject: [PATCH 007/309] Update tool_calling.md (#11701) --- docs/source/usage/tool_calling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/usage/tool_calling.md b/docs/source/usage/tool_calling.md index 34b26647a9..062f2021eb 100644 --- a/docs/source/usage/tool_calling.md +++ b/docs/source/usage/tool_calling.md @@ -10,7 +10,7 @@ Start the server with tool calling enabled. 
This example uses Meta's Llama 3.1 8 vllm serve meta-llama/Llama-3.1-8B-Instruct \ --enable-auto-tool-choice \ --tool-call-parser llama3_json \ - --chat-template examples/tool_chat_template_llama3_json.jinja + --chat-template examples/tool_chat_template_llama3.1_json.jinja ``` Next, make a request to the model that should result in it using the available tools: From d1d49397e7f8d1ac472d763dae395b67fdda1ef8 Mon Sep 17 00:00:00 2001 From: Alberto Ferrer Date: Sat, 4 Jan 2025 00:29:02 -0600 Subject: [PATCH 008/309] Update bnb.md with example for OpenAI (#11718) --- docs/source/quantization/bnb.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md index 8240eca1c7..f7f41726f3 100644 --- a/docs/source/quantization/bnb.md +++ b/docs/source/quantization/bnb.md @@ -37,3 +37,10 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` +## OpenAI Compatible Server + +Append the following to your 4bit model arguments: + +``` +--quantization bitsandbytes --load-format bitsandbytes +``` From fbf25645542fdcfb3f1a27ba05486492e368925c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 4 Jan 2025 14:41:31 +0800 Subject: [PATCH 009/309] [V1] Add `RayExecutor` support for `AsyncLLM` (api server) (#11712) --- vllm/v1/engine/async_llm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 564d8a8343..0696caf883 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -22,6 +22,7 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.executor.ray_utils import initialize_ray_cluster logger = init_logger(__name__) @@ -131,7 +132,11 @@ class AsyncLLM(EngineClient): executor_class: Type[Executor] distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "mp": + if distributed_executor_backend == "ray": + initialize_ray_cluster(vllm_config.parallel_config) + from vllm.v1.executor.ray_executor import RayExecutor + executor_class = RayExecutor + elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor else: From d91457d529c2df5d66bdfd939b90b7c75a9729b8 Mon Sep 17 00:00:00 2001 From: xcnick Date: Sat, 4 Jan 2025 14:49:46 +0800 Subject: [PATCH 010/309] [V1] Add kv cache utils tests. 
(#11513) Signed-off-by: xcnick --- tests/v1/core/test_kv_cache_utils.py | 241 +++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 tests/v1/core/test_kv_cache_utils.py diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py new file mode 100644 index 0000000000..faa3a91de1 --- /dev/null +++ b/tests/v1/core/test_kv_cache_utils.py @@ -0,0 +1,241 @@ +import pytest + +from vllm.inputs import token_inputs +from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, + KVCacheBlock, + generate_block_hash_extra_keys, + hash_block_tokens, + hash_request_tokens) +from vllm.v1.request import Request + + +def make_request(request_id, + prompt_token_ids, + mm_positions=None, + mm_hashes=None): + return Request( + request_id=request_id, + inputs=token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_placeholders={"image": mm_positions} + if mm_positions else None, + multi_modal_hashes=mm_hashes, + ), + sampling_params=SamplingParams(max_tokens=17), + eos_token_id=100, + arrival_time=0, + lora_request=None, + ) + + +def test_kv_cache_block(): + # Test KVCacheBlock initialization + block = KVCacheBlock(block_id=0) + assert block.block_id == 0 + assert block.ref_cnt == 0 + assert block.block_hash is None + + # Test reference count manipulation + block.incr_ref() + assert block.ref_cnt == 1 + block.decr_ref() + assert block.ref_cnt == 0 + + # Test block hash setting and resetting + block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3)) + block.block_hash = block_hash + assert block.block_hash == block_hash + + block.reset_hash() + assert block.block_hash is None + + +def test_free_kv_cache_block_queue_initialization(): + # Test with a single block + block = KVCacheBlock(block_id=0) + queue = FreeKVCacheBlockQueue([block]) + assert queue.num_free_blocks == 1 + assert queue.free_list_head == block + assert queue.free_list_tail == block + + +def test_free_kv_cache_block_queue_operations(): + # Create a list of KVCacheBlock objects + blocks = [KVCacheBlock(block_id=i) for i in range(5)] + + # Create a FreeKVCacheBlockQueue with these blocks + queue = FreeKVCacheBlockQueue(blocks) + + # Check initial state + assert queue.num_free_blocks == 5 + assert queue.free_list_head == blocks[0] + assert queue.free_list_tail == blocks[4] + + # Pop the first block + block1 = queue.popleft() + assert block1 == blocks[0] + assert queue.num_free_blocks == 4 + assert queue.free_list_head == blocks[1] + assert queue.free_list_tail == blocks[4] + + # Remove a block from the middle + block_to_remove = blocks[2] + queue.remove(block_to_remove) + assert queue.num_free_blocks == 3 + assert blocks[1].next_free_block == blocks[3] + assert blocks[3].prev_free_block == blocks[1] + + # Append a block back + queue.append(block_to_remove) + assert queue.num_free_blocks == 4 + assert queue.free_list_tail == block_to_remove + assert block_to_remove.prev_free_block == blocks[4] + assert block_to_remove.next_free_block is None + + # Pop blocks until empty + for _ in range(4): + queue.popleft() + assert queue.num_free_blocks == 0 + assert queue.free_list_head is None + assert queue.free_list_tail is None + + # Attempt to pop from an empty queue + with pytest.raises(ValueError) as e: + queue.popleft() + assert str(e.value) == "No free blocks available" + + +def test_free_kv_cache_block_queue_get_all_free_blocks(): + # Create a list of KVCacheBlock objects + blocks = [KVCacheBlock(block_id=i) for i in 
range(5)] + + # Create a FreeKVCacheBlockQueue with these blocks + queue = FreeKVCacheBlockQueue(blocks) + + # Check all blocks are correctly retrieved + assert queue.get_all_free_blocks() == blocks + + # Pop a block and check again + queue.popleft() + assert queue.get_all_free_blocks() == blocks[1:] + + # Remove a block and check again + block_to_remove = blocks[2] + queue.remove(block_to_remove) + assert queue.get_all_free_blocks() == blocks[1:2] + blocks[3:] + + # Append a block back and check again + queue.append(block_to_remove) + assert queue.get_all_free_blocks() == \ + blocks[1:2] + blocks[3:] + [block_to_remove] + + +def test_generate_block_hash_extra_keys(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(20)], + mm_positions=[{ + "offset": 0, + "length": 5 + }, { + "offset": 10, + "length": 5 + }], + mm_hashes=["hash1", "hash2"], + ) + + # Test with no extra keys + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) + assert extra_keys == (("hash1", 0), ) + assert next_mm_idx == 1 + + # Test with partial overlap + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0) + assert extra_keys == (("hash1", 3), ) + assert next_mm_idx == 1 + + # Test with no overlap + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0) + assert extra_keys == () + assert next_mm_idx == 1 + + # Test with multiple extra keys + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0) + assert extra_keys == (("hash1", 0), ("hash2", 0)) + assert next_mm_idx == 2 + + +def test_generate_block_hash_extra_keys_no_mm_inputs(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(6)], + mm_positions=None, + mm_hashes=None, + ) + + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) + assert extra_keys is None + assert next_mm_idx == 0 + + +def test_hash_block_tokens(): + parent_block_hash = 123 + curr_block_token_ids = (1, 2, 3) + extra_keys = ("key1", "key2") + + block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids, + extra_keys) + assert isinstance(block_hash, BlockHashType) + assert block_hash.hash_value == hash( + (parent_block_hash, *curr_block_token_ids)) + assert block_hash.token_ids == curr_block_token_ids + assert block_hash.extra_keys == extra_keys + + +def test_hash_request_tokens(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(6)], + mm_positions=[{ + "offset": 0, + "length": 3 + }, { + "offset": 3, + "length": 3 + }], + mm_hashes=["hash1", "hash2"], + ) + + block_size = 3 + block_hashes = hash_request_tokens(block_size, request) + + assert len(block_hashes) == 2 + assert isinstance(block_hashes[0], BlockHashType) + assert isinstance(block_hashes[1], BlockHashType) + + # Check the first block + assert block_hashes[0].token_ids == (0, 1, 2) + assert block_hashes[0].extra_keys == (("hash1", 0), ) + + # Check the second block + assert block_hashes[1].token_ids == (3, 4, 5) + assert block_hashes[1].extra_keys == (("hash2", 0), ) + + +def test_hash_request_tokens_no_mm_inputs(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(6)], + mm_positions=None, + mm_hashes=None, + ) + + block_size = 3 + block_hashes = hash_request_tokens(block_size, request) + + assert len(block_hashes) == 2 + assert block_hashes[0].token_ids == (0, 1, 2) + assert block_hashes[0].extra_keys is None + assert block_hashes[1].token_ids == (3, 4, 5) + assert block_hashes[1].extra_keys is None 
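The tests added above pin down the chained block-hash scheme that V1 prefix caching relies on: each full block's hash is computed as `hash((parent_block_hash, *block_token_ids))`, with multi-modal hashes carried alongside as extra keys. The sketch below re-derives that chaining in isolation so the assertions are easier to follow; it is a simplified stand-in, not the actual `vllm.v1.core.kv_cache_utils` implementation, and the helper name `chain_block_hashes` is illustrative only.

```python
# Minimal sketch of the chained block hashing asserted by the tests above.
# It mirrors hash((parent_block_hash, *curr_block_token_ids)); the real
# vllm.v1.core.kv_cache_utils code additionally records token_ids and
# extra_keys in a BlockHashType structure.
from typing import List, Optional


def chain_block_hashes(token_ids: List[int], block_size: int) -> List[int]:
    """Hash each *full* block, folding the previous block's hash into the next."""
    hashes: List[int] = []
    parent_hash: Optional[int] = None
    num_full_blocks = len(token_ids) // block_size
    for i in range(num_full_blocks):
        block = tuple(token_ids[i * block_size:(i + 1) * block_size])
        parent_hash = hash((parent_hash, *block))
        hashes.append(parent_hash)
    return hashes


if __name__ == "__main__":
    # Two full blocks of size 3, as in test_hash_request_tokens above.
    print(chain_block_hashes(list(range(6)), block_size=3))
```

Because every hash folds in the previous block's hash, two requests that share a prefix produce identical leading block hashes, which is the property prefix caching builds on when reusing cached KV blocks across requests.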
From 300acb83472512b14ec7ba8cdf45efe07e8c8f68 Mon Sep 17 00:00:00 2001 From: Yan Burman Date: Sat, 4 Jan 2025 08:50:16 +0200 Subject: [PATCH 011/309] [Core][Bugfix] Use correct device to initialize GPU data during CUDA-graph-capture (#11233) Signed-off-by: Yan Burman Signed-off-by: Ido Asraff --- tests/distributed/test_custom_all_reduce.py | 2 +- tests/distributed/test_pynccl.py | 2 +- vllm/distributed/parallel_state.py | 7 +++--- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/model_runner.py | 25 +++++++++++++-------- 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 86ca1948ef..4072616fd3 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): for sz in test_sizes: for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture() as graph_capture_context: + with graph_capture(device=device) as graph_capture_context: # use integers so result matches NCCL exactly inp1 = torch.randint(1, 16, (sz, ), diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 3e9b0e10a1..36cfe42251 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -107,7 +107,7 @@ def multiple_allreduce_with_vllm_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") ensure_model_parallel_initialized(2, 2) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) - with graph_capture(): + with graph_capture(device=device): # two tp groups can communicate independently if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e6768467f4..a0d4235460 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -920,7 +920,7 @@ def get_kv_transfer_group() -> kv_transfer.KVTransferAgent: @contextmanager -def graph_capture(): +def graph_capture(device: torch.device): """ `graph_capture` is a context manager which should surround the code that is capturing the CUDA graph. Its main purpose is to ensure that the @@ -934,8 +934,9 @@ def graph_capture(): in order to explicitly distinguish the kernels to capture from other kernels possibly launched on background in the default stream. """ - with get_tp_group().graph_capture() as context, get_pp_group( - ).graph_capture(context): + context = GraphCaptureContext(torch.cuda.Stream(device=device)) + with get_tp_group().graph_capture(context), get_pp_group().graph_capture( + context): yield context diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 75098b0330..294c76cfb6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -836,7 +836,7 @@ class GPUModelRunner: # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. - with graph_capture(): + with graph_capture(device=self.device): for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. 
cudagraph_num_of_warmups): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 637fba2361..1c6d1bbee7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1426,10 +1426,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = self.max_batchsize_to_capture - input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() - input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() + input_tokens = torch.zeros(max_batch_size, + dtype=torch.long, + device=self.device) + input_positions = torch.zeros(max_batch_size, + dtype=torch.long, + device=self.device) if self.model_config.uses_mrope: - input_positions = torch.tile(input_positions, (3, 1)) + input_positions = torch.tile(input_positions, + (3, 1)).cuda(device=self.device) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. previous_hidden_states = None @@ -1448,8 +1453,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): dtype=self.model_config.dtype, device=self.device) - with self.attn_state.graph_capture( - max_batch_size), graph_capture() as graph_capture_context: + with self.attn_state.graph_capture(max_batch_size), graph_capture( + self.device) as graph_capture_context: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. for virtual_engine in range( @@ -1549,10 +1554,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): """ # During the decode phase encoder_input_ids and encoder_positions are # unset. Do the same thing for graph capture. - capture_inputs["encoder_input_ids"] = torch.tensor( - [], dtype=torch.long).cuda() - capture_inputs["encoder_positions"] = torch.tensor( - [], dtype=torch.long).cuda() + capture_inputs["encoder_input_ids"] = torch.tensor([], + dtype=torch.long, + device=self.device) + capture_inputs["encoder_positions"] = torch.tensor([], + dtype=torch.long, + device=self.device) @property def vocab_size(self) -> int: From eed11ebee93e9d137ac74d8e6e97427354bd3797 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 4 Jan 2025 19:40:53 +0800 Subject: [PATCH 012/309] [VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717) Signed-off-by: DarkLight1337 --- .../__init__.py | 0 .../test_idefics3.py | 0 .../test_internvl.py | 0 .../processing/test_llava_next.py | 58 ++ .../processing/test_llava_onevision.py | 59 ++ .../test_phi3v.py | 44 +- .../test_qwen.py | 0 .../test_qwen2_vl.py | 39 +- .../vision_language/test_models.py | 9 +- .../vision_language/test_qwen2_vl.py | 127 ----- tests/multimodal/test_processing.py | 170 +++--- vllm/model_executor/models/aria.py | 5 +- vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/chameleon.py | 5 +- vllm/model_executor/models/clip.py | 11 +- vllm/model_executor/models/fuyu.py | 5 +- vllm/model_executor/models/llava.py | 75 ++- vllm/model_executor/models/llava_next.py | 15 +- .../model_executor/models/llava_next_video.py | 283 +++++----- vllm/model_executor/models/llava_onevision.py | 507 ++++++++---------- vllm/model_executor/models/phi3v.py | 26 +- vllm/model_executor/models/pixtral.py | 11 +- vllm/model_executor/models/qwen2_audio.py | 15 +- vllm/model_executor/models/qwen2_vl.py | 199 ++++--- vllm/model_executor/models/siglip.py | 11 +- vllm/model_executor/models/ultravox.py | 11 +- vllm/model_executor/models/vision.py | 37 +- 
vllm/multimodal/parse.py | 14 + vllm/multimodal/processing.py | 320 +++++++---- vllm/multimodal/registry.py | 3 +- vllm/transformers_utils/tokenizer.py | 13 + 31 files changed, 1104 insertions(+), 973 deletions(-) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/__init__.py (100%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_idefics3.py (100%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_internvl.py (100%) create mode 100644 tests/models/decoder_only/vision_language/processing/test_llava_next.py create mode 100644 tests/models/decoder_only/vision_language/processing/test_llava_onevision.py rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_phi3v.py (60%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_qwen.py (100%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_qwen2_vl.py (64%) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/processing/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py rename to tests/models/decoder_only/vision_language/processing/__init__.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/processing/test_idefics3.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py rename to tests/models/decoder_only/vision_language/processing/test_idefics3.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/tests/models/decoder_only/vision_language/processing/test_internvl.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py rename to tests/models/decoder_only/vision_language/processing/test_internvl.py diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py new file mode 100644 index 0000000000..6772130c9b --- /dev/null +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -0,0 +1,58 @@ +import pytest +from PIL import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputProcessingContext + +from ....utils import build_model_context + + +# Fixtures lazy import to avoid initializing CUDA during test collection +@pytest.fixture() +def processor_for_llava_next(): + from vllm.model_executor.models.llava_next import ( + LlavaNextMultiModalProcessor) + return LlavaNextMultiModalProcessor + + +# FIXME: image_size [(198, 176), (176, 198)] +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), + (488, 183)]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements( + processor_for_llava_next, + model_id: str, + image_size: tuple[int, int], + num_imgs: int, +): + """ + Ensure LlavaNextMultiModalProcessor handles prompt replacement properly. 
+ """ + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ctx = InputProcessingContext(ctx.model_config, tokenizer) + + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} + + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processor = processor_for_llava_next(ctx) + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 1 + assert first_placeholder["length"] == ( + len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py new file mode 100644 index 0000000000..71adde6568 --- /dev/null +++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py @@ -0,0 +1,59 @@ +import pytest +from PIL import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputProcessingContext + +from ....utils import build_model_context + + +# Fixtures lazy import to avoid initializing CUDA during test collection +@pytest.fixture() +def processor_for_llava_onevision(): + from vllm.model_executor.models.llava_onevision import ( + LlavaOnevisionMultiModalProcessor) + return LlavaOnevisionMultiModalProcessor + + +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), + (488, 183), (198, 176), (176, 198)]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements( + processor_for_llava_onevision, + model_id: str, + image_size: tuple[int, int], + num_imgs: int, +): + """ + Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement + properly. 
+ """ + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ctx = InputProcessingContext(ctx.model_config, tokenizer) + + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} + + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processor = processor_for_llava_onevision(ctx) + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 0 + assert first_placeholder["length"] == len( + processed_inputs["prompt_token_ids"]) // num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py similarity index 60% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py rename to tests/models/decoder_only/vision_language/processing/test_phi3v.py index 3edf96d111..249045b3c0 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py @@ -1,6 +1,4 @@ """Tests for phi3v's multimodal preprocessing kwargs.""" -from typing import Optional - import pytest from transformers import AutoTokenizer @@ -10,8 +8,6 @@ from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from .....conftest import _ImageAssets from ....utils import build_model_context -models = ["microsoft/Phi-3.5-vision-instruct"] - # Wrap lazy imports to avoid initializing CUDA during test collection @pytest.fixture() @@ -20,40 +16,40 @@ def processor_for_phi3v(): return Phi3VMultiModalProcessor -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) +# yapf: disable @pytest.mark.parametrize( - "num_crops,expected_toks_per_img", + ("mm_processor_kwargs", "expected_toks_per_img"), [ - (4, 757), - (16, 1921), + ({"num_crops": 4}, 757), + ({"num_crops": 16}, 1921), # the default num_crops of phi-3.5-vision is 4 - (None, 757), + ({}, 757), ]) +# yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, - model: str, num_crops: Optional[int], - expected_toks_per_img: int, num_imgs: int): +def test_processor_override( + processor_for_phi3v, + image_assets: _ImageAssets, + model_id: str, + mm_processor_kwargs: dict[str, int], + expected_toks_per_img: int, + num_imgs: int, +): """Ensure input_processor_for_phi3v handles num_crops properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. 
ctx = build_model_context( - model_name=model, - tokenizer_name=model, + model_name=model_id, + tokenizer_name=model_id, trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) + # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" - images = [image_assets[0].pil_image] * num_imgs - - mm_data = {"image": images} - mm_processor_kwargs = {} - if num_crops is not None: - mm_processor_kwargs = {"num_crops": num_crops} + mm_data = {"image": [image_assets[0].pil_image] * num_imgs} processor = processor_for_phi3v(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/processing/test_qwen.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py rename to tests/models/decoder_only/vision_language/processing/test_qwen.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py similarity index 64% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py rename to tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py index 1f0b482666..b9ac887edf 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py @@ -1,5 +1,3 @@ -from typing import Any, Dict, Tuple - import pytest from transformers import AutoTokenizer @@ -8,56 +6,45 @@ from vllm.inputs import InputProcessingContext from .....conftest import _ImageAssets from ....utils import build_model_context -MODEL = "Qwen/Qwen2-VL-2B-Instruct" -MIN_PIXELS = "min_pixels" -MAX_PIXELS = "max_pixels" - # Fixtures lazy import to avoid initializing CUDA during test collection -# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple -# input mappers. 
@pytest.fixture() def processor_for_qwen2_vl(): from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor return Qwen2VLMultiModalProcessor +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) +# yapf: disable @pytest.mark.parametrize( - "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [ + ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [ ({}, 1426, (5704, 1176)), - ({ - MIN_PIXELS: 64**2, - MAX_PIXELS: 512**2 - }, 330, (1320, 1176)), + ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)), ]) -@pytest.mark.parametrize("model", [MODEL]) +# yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( processor_for_qwen2_vl, image_assets: _ImageAssets, - model: str, - mm_processor_kwargs: Dict[str, Any], + model_id: str, + mm_processor_kwargs: dict[str, object], expected_toks_per_img: int, - expected_pixels_shape: Tuple[int, int], + expected_pixels_shape: tuple[int, int], num_imgs: int, ): """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. ctx = build_model_context( - model_name=model, - tokenizer_name=model, + model_name=model_id, + tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) + # Build the image str / prompt based on the number of images we pass prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs - images = [image_assets[0].pil_image] * num_imgs - - mm_data = {"image": images} + mm_data = {"image": [image_assets[0].pil_image] * num_imgs} processor = processor_for_qwen2_vl(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7db0816682..dc0b683c1f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -274,10 +274,8 @@ VLM_TEST_SETTINGS = { ), limit_mm_per_prompt={"image": 4}, )], - # Llava-next tests fixed sizes & the default size factors - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], ), - "llava_one_vision": VLMTestInfo( + "llava_onevision": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 @@ -288,8 +286,6 @@ VLM_TEST_SETTINGS = { ), auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, - # Llava-one-vision tests fixed sizes & the default size factors - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], custom_test_opts=[CustomTestOptions( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 @@ -306,7 +302,6 @@ VLM_TEST_SETTINGS = { max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, - image_sizes=[((1669, 2560), 
(2560, 1669), (183, 488), (488, 183))], ), "mantis": VLMTestInfo( models=["TIGER-Lab/Mantis-8B-siglip-llama3"], @@ -431,7 +426,7 @@ VLM_TEST_SETTINGS = { ) for inp in custom_inputs.different_patch_input_cases_internvl() ], ), - "llava_one_vision-multiple-images": VLMTestInfo( + "llava_onevision-multiple-images": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 51fe7d2ad3..16e256e040 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -427,130 +427,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, mm_limit=1, tensor_parallel_size=1, ) - - -def run_chunked_prefill_test( - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Compare inference result between - chunked prefill disabled and chunked prefill enabled - """ - - # NOTE: - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - task="generate", - max_model_len=4000, - max_num_seqs=4, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - - outputs_per_case = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) - for prompts, images, videos in inputs - ] - - with vllm_runner( - model, - task="generate", - max_model_len=4000, - max_num_seqs=4, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enable_chunked_prefill=True, - # should be small enough to ensure prefilling is chunked - max_num_batched_tokens=32, - mm_processor_kwargs={ - "max_pixels": 16 * 28 * 28, - }) as vllm_model_chunked: - outputs_per_case_chunked = [ - vllm_model_chunked.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) for prompts, images, videos in inputs - ] - - for outputs, \ - outputs_chunked \ - in zip(outputs_per_case, - outputs_per_case_chunked): - check_logprobs_close( - outputs_0_lst=outputs, - outputs_1_lst=outputs_chunked, - name_0="non_chunked", - name_1="chunked", - ) - - -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [1]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts, - model: str, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: - """ - Test Qwen2-VL's chunked prefill with M-RoPE - """ - prompts = [ - qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt) - for prompt in example_prompts[:1] - ] - - # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs, - # so an image is included in the inputs - # 2. 
however, Qwen2-VL currently won't work properly - # when chunked prefill is enabled and there are some multi-modal inputs, - # here use a hacky way: provide a **zero-length** image to make it happy - # - # and finally we achieved: - # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests - zero_len_image = { - "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)), - "image_grid_thw": torch.tensor([[0, 0, 0]]) - } - images = [zero_len_image] * len(prompts) - - inputs_per_case: List[Tuple[List[str], PromptImageInput, - PromptVideoInput]] = [ - (prompts, images, []), - ] - - run_chunked_prefill_test( - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index f99d7556b2..b32faa699e 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -11,8 +11,8 @@ from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, - _PlaceholderInfo, find_text_matches, - find_token_matches, iter_placeholders, + _PlaceholderInfo, find_mm_placeholders, + find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) @@ -314,21 +314,27 @@ def test_find_replace_text( # Should not be used since there is nothing to convert to text mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) + mm_prompt_repls = { + key: [ + PromptReplacement(key, target, + repl_by_key[key]).bind(mock_tokenizer) + ] for key, target in target_by_key.items() - ] - matches = find_text_matches(prompt, prompt_repls) + } + mm_matches = { + key: find_text_matches(prompt, prompt_repls) + for key, prompt_repls in mm_prompt_repls.items() + } result = replace_text_matches( prompt, - matches, + mm_matches, {key: mm_count for key in repl_by_key}, ) # Only displayed on error - print("matches:", matches) + print("mm_matches:", mm_matches) print("result:", result) # Manually constructed results @@ -380,21 +386,27 @@ def test_find_replace_tokens( # Should not be used since there is nothing to convert to tokens mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) + mm_prompt_repls = { + key: [ + PromptReplacement(key, target, + repl_by_key[key]).bind(mock_tokenizer) + ] for key, target in target_by_key.items() - ] - matches = find_token_matches(prompt, prompt_repls) + } + mm_matches = { + key: find_token_matches(prompt, prompt_repls) + for key, prompt_repls in mm_prompt_repls.items() + } result = replace_token_matches( prompt, - matches, + mm_matches, {key: mm_count for key in repl_by_key}, ) # Only displayed on error - print("matches:", matches) + print("mm_matches:", mm_matches) print("result:", result) # Manually constructed results @@ -417,58 +429,76 @@ def test_find_replace_tokens( [ ( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=6, - replacement=[32000, 32000], - ), - ], + { + "pattern_1": [ + _PlaceholderInfo( + modality="pattern_1", + item_idx=0, + start_idx=6, + replacement=[32000, 32000], + ), + ], + } + ), ( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], - [ - _PlaceholderInfo( - 
modality="pattern_1", - start_idx=1, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_1", - start_idx=5, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_3", - start_idx=7, - replacement=[1550, 918, 1550], - ), - ], + { + "pattern_1": [ + _PlaceholderInfo( + modality="pattern_1", + item_idx=0, + start_idx=1, + replacement=[32000, 32000], + ), + _PlaceholderInfo( + modality="pattern_1", + item_idx=1, + start_idx=5, + replacement=[32000, 32000], + ), + ], + "pattern_3": [ + _PlaceholderInfo( + modality="pattern_3", + item_idx=0, + start_idx=7, + replacement=[1550, 918, 1550], + ), + ], + } ), ( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=1, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_1", - start_idx=3, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_3", - start_idx=6, - replacement=[1550, 918, 1550], - ), - ], + { + "pattern_1": [ + _PlaceholderInfo( + modality="pattern_1", + item_idx=0, + start_idx=1, + replacement=[32000, 32000], + ), + _PlaceholderInfo( + modality="pattern_1", + item_idx=1, + start_idx=3, + replacement=[32000, 32000], + ), + ], + "pattern_3": [ + _PlaceholderInfo( + modality="pattern_3", + item_idx=0, + start_idx=6, + replacement=[1550, 918, 1550], + ), + ], + } ), ] ) # yapf: enable -def test_iter_placeholders( +def test_find_mm_placeholders( repl_by_key, prompt, expected, @@ -476,19 +506,18 @@ def test_iter_placeholders( # Should not be used since there is nothing to convert to tokens mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, [], repl).bind(mock_tokenizer) + mm_prompt_repls = { + key: [PromptReplacement(key, [], repl).bind(mock_tokenizer)] for key, repl in repl_by_key.items() - ] + } - result = list( - iter_placeholders( - prompt_repls, - prompt, - # Effectively match all occurrences in the prompt - {key: 3 - for key in repl_by_key}, - )) + result = find_mm_placeholders( + mm_prompt_repls, + prompt, + # Effectively match all occurrences in the prompt + {key: 3 + for key in repl_by_key}, + ) # Only displayed on error print("result:", result) @@ -694,7 +723,10 @@ def _test_processing_cache_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text + prompt = baseline_processor._get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt_text # Drop unnecessary keys and test single -> multi conversion if rng.rand() < simplify_rate: @@ -728,6 +760,8 @@ def _test_processing_cache_correctness( ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), + ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), ("mistral-community/pixtral-12b", {"image": True}), ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 4f0d679bd6..2fd4262a9d 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -456,7 +456,7 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor): hf_config = self.ctx.get_hf_config() return max(hf_config.projector_patch_to_query_dict.values()) - def 
get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} def _get_mm_fields_config( @@ -488,8 +488,9 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor): ) ] - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config() diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 0fe10d8585..b3ecb2f22d 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -405,7 +405,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor): hf_config = self.ctx.get_hf_config(Blip2Config) return hf_config.num_query_tokens - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} def _get_hf_processor(self) -> Blip2Processor: @@ -457,8 +457,9 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor): return result - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(Blip2Config) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 0bd0194243..1ad44678a5 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -57,7 +57,7 @@ class ChameleonMultiModalProcessor(BaseMultiModalProcessor): processor = self._get_hf_processor() return processor.image_seq_length - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} def _get_hf_processor(self) -> ChameleonProcessor: @@ -90,8 +90,9 @@ class ChameleonMultiModalProcessor(BaseMultiModalProcessor): ) ] - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: config = self.ctx.get_hf_config(ChameleonConfig) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 0188452054..1bde45cb14 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -164,15 +164,18 @@ class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]): def get_max_image_tokens(self) -> int: return get_max_clip_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_clip_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa class CLIPVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 3680d01725..7cd58fbc7c 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -96,7 +96,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): nrows = math.ceil(image_height / 30) return ncols, nrows - def get_mm_max_tokens_per_item(self) -> 
Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: target_width, target_height = self._get_image_target_size() max_ncols, max_nrows = self._get_image_feature_grid_size( @@ -208,8 +208,9 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): return result - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: target_width, target_height = self._get_image_target_size() diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 78de27cd82..d522378e0b 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -25,11 +25,9 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize) -from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, +from vllm.multimodal.processing import (InputProcessingContext, MultiModalDataItems, ProcessingCache, - ProcessorInputs, PromptReplacement, - full_groupby_modality) + ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -39,7 +37,7 @@ from .pixtral import (PixtralHFVisionModel, from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import vision_encoder_info +from .vision import BaseVisionLanguageMultiModalProcessor class LlavaImagePixelInputs(TypedDict): @@ -100,19 +98,7 @@ class LlavaLikeConfig(Protocol): vision_feature_layer: Final[Union[int, List[int]]] -class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): - - def __init__(self, - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: - super().__init__(ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks) - - vision_config = self._get_hf_config().vision_config - self._vision_encoder_info = vision_encoder_info(vision_config) +class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): @abstractmethod def _get_hf_config(self) -> LlavaLikeConfig: @@ -121,6 +107,19 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self._get_max_image_tokens()} + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _apply_feature_select_strategy( self, strategy: str, @@ -142,19 +141,6 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): self._vision_encoder_info.get_max_image_tokens(), ) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - def _get_dummy_image_size(self) -> ImageSize: image_size = 
self._vision_encoder_info.get_image_size() return ImageSize(image_size, image_size) @@ -163,8 +149,9 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): def _get_image_token(self) -> str: raise NotImplementedError - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) @@ -709,7 +696,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ")", # 3 tokens ]) - mantis_repls = self._bind_prompt_replacements([ + mantis_mm_repls = self._bind_and_group_repls([ PromptReplacement( modality="image", target=[image_token_id] * num_image_tokens, @@ -719,7 +706,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): prompt_ids, prompt_text, _ = self._apply_prompt_replacements( result["prompt_token_ids"], - mantis_repls, + mantis_mm_repls, mm_item_counts, ) @@ -728,15 +715,19 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): hf_processor_mm_kwargs, mm_kwargs, ) - orig_repls = self._bind_prompt_replacements(unbound_orig_repls) + orig_repls = self._bind_and_group_repls(unbound_orig_repls) - all_placeholders = self._find_placeholders(orig_repls, prompt_ids, - mm_item_counts) - assert len(all_placeholders) == mm_item_counts.get("image", 0) + mm_placeholders = self._find_mm_placeholders( + orig_repls, + prompt_ids, + mm_item_counts, + ) - mm_placeholders = { - modality: [item.to_range() for item in items] - for modality, items in full_groupby_modality(all_placeholders) + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) + + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() } return MultiModalInputsV2( @@ -744,7 +735,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): prompt=prompt_text, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_placeholders=mm_placeholders, + mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 24debd1cbf..3769f04f94 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -67,9 +67,6 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): def _get_hf_processor(self) -> LlavaNextProcessor: return self.ctx.get_hf_processor(LlavaNextProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -81,6 +78,9 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): image_embeds=MultiModalFieldConfig.batched("image"), ) + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token + def _get_max_image_tokens(self) -> int: largest_feature_size, _ = self._get_pinpoint_with_most_features() return largest_feature_size @@ -97,20 +97,20 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): image_height: int, ) -> int: hf_config = self._get_hf_config() + vision_encoder_info = self._vision_encoder_info base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_num_image_tokens( + vision_encoder_info.get_num_image_tokens( image_width=image_width, image_height=image_height, ), ) - num_patches = self._vision_encoder_info.get_num_patches() num_patch_height, num_patch_width = get_anyres_image_grid_shape( image_size=(image_height, image_width), 
grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=self._vision_encoder_info.get_image_size(), + patch_size=vision_encoder_info.get_image_size(), ) ( @@ -119,7 +119,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): ) = self._get_num_unpadded_features( original_height=image_height, original_width=image_width, - npatches=num_patches, + npatches=vision_encoder_info.get_patch_grid_length(), num_patch_height=num_patch_height, num_patch_width=num_patch_width, ) @@ -155,6 +155,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): unpadded_features = current_height * current_width newline_features = current_height + return (unpadded_features, newline_features) def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 0de9d8c5ea..ee6b89f0d4 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -3,38 +3,32 @@ from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from transformers import (CLIPVisionConfig, LlavaNextVideoConfig, - SiglipVisionConfig) +from transformers import (BatchFeature, LlavaNextVideoConfig, + LlavaNextVideoProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) +from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - dummy_seq_data_for_siglip) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) - -# For profile run -_MAX_FRAMES_PER_VIDEO = 32 -_MAX_NUM_VIDEOS = 1 +from .vision import BaseVisionLanguageMultiModalProcessor class LlavaNextVideoPixelInputs(TypedDict): @@ -50,144 +44,149 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -def get_llava_next_video_frame_feature_size( - hf_config: LlavaNextVideoConfig) -> int: - # Support both CLIPVisionConfig and SiglipVisionConfig - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - spatial_pool_stride = hf_config.spatial_pool_stride +class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): - return int((image_size / patch_size / spatial_pool_stride)**2) + def _get_hf_config(self) -> LlavaNextVideoConfig: 
+ return self.ctx.get_hf_config(LlavaNextVideoConfig) + def _get_hf_processor(self) -> LlavaNextVideoProcessor: + return self.ctx.get_hf_processor(LlavaNextVideoProcessor) -def _get_max_llm_tokens(ctx: InputContext) -> int: - """ - Calculated from the maximum video frames under the context length - constraints of the language model. - """ - hf_text_config = ctx.model_config.hf_text_config - model_config = ctx.model_config - max_tokens = model_config.max_model_len - rope_scaling = model_config.rope_scaling + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} - if rope_scaling: - rope_scaling_factor = hf_text_config.rope_scaling["factor"] - else: - rope_scaling_factor = 1 + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + num_frames = self._get_dummy_num_frames(seq_len) + max_video_tokens = self._get_max_video_tokens(num_frames) - max_tokens *= rope_scaling_factor + return {"video": max_video_tokens} - return max_tokens + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) + def _get_num_frame_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + spatial_pool_stride = hf_config.spatial_pool_stride -def get_max_llava_next_video_tokens(ctx: InputContext) -> int: - # Currently set to 32 frames - # TODO: max_tokens = _get_max_llm_tokens(ctx) - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) - return _MAX_FRAMES_PER_VIDEO * tokens_per_frame + patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) + return pooled_grid_length * pooled_grid_length -def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - vision_config = hf_config.vision_config - - # TODO: support multiple videos - num_videos = mm_counts["video"] - if num_videos != _MAX_NUM_VIDEOS: - raise NotImplementedError( - f"Only {_MAX_NUM_VIDEOS} videos are supported") - - # TODO: support configuring the number of frames - frames_per_video = _MAX_FRAMES_PER_VIDEO - # num_images = num_videos * frames_per_video - - # fills the sequence with as longer video data as possible - tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) - video_feature_size = frames_per_video * tokens_per_frame - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video", + def _get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + num_frame_tokens = self._get_num_frame_tokens( + image_width=image_width, + image_height=image_height, ) - pil_frame = dummy_image_for_clip(vision_config, num_images=1) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) - mm_data = {"video": mm_data_per_video} - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_videos, - 
image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video", + return num_frame_tokens * num_frames + + def _get_max_video_tokens(self, num_frames: int) -> int: + return self._get_num_video_tokens(image_width=999999, + image_height=999999, + num_frames=num_frames) + + def _get_max_video_frames(self, max_tokens: int) -> int: + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + + if self._get_max_video_tokens(next_num_frames) > max_tokens: + break + + num_frames = next_num_frames + + return num_frames + + def _get_dummy_num_frames(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_total_frames = self._get_max_video_frames(seq_len) + + return max(max_total_frames // max(max_videos, 1), 1) + + def _get_dummy_image_size(self) -> ImageSize: + image_size = self._vision_encoder_info.get_image_size() + return ImageSize(image_size, image_size) + + def _get_video_token(self) -> str: + return self._get_hf_processor().video_token + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self._get_hf_config() + video_token_id = hf_config.video_token_index + + def get_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (VideoEmbeddingItems, VideoProcessorItems)) + + if isinstance(videos, VideoEmbeddingItems): + num_video_tokens = videos.get_feature_size(item_idx) + else: + image_size = videos.get_frame_size(item_idx) + num_video_tokens = self._get_num_video_tokens( + image_width=image_size.width, + image_height=image_size.height, + num_frames=videos.get_num_frames(item_idx), + ) + + return [video_token_id] * num_video_tokens + + return [ + PromptReplacement( + modality="video", + target=[video_token_id], + replacement=get_replacement, + ), + ] + + def _get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_videos = mm_counts.get("video", 0) + + video_token = self._get_video_token() + target_width, target_height = self._get_dummy_image_size() + + mm_data = { + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=video_token * num_videos, + mm_data=mm_data, ) - pil_frame = dummy_image_for_siglip(vision_config, num_images=1) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) - mm_data = {"video": mm_data_per_video} - return DummyData(seq_data, mm_data, ranges) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_for_llava_next_video(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "video" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "video" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. 
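The frame-budgeting helpers introduced above (_get_num_video_tokens, _get_max_video_frames, _get_dummy_num_frames) boil down to a simple search: grow the frame count until one more frame would overflow the token budget, then split the total across however many videos are allowed per prompt. A minimal self-contained sketch of that logic, using a hypothetical flat TOKENS_PER_FRAME cost in place of the real vision-encoder math:

TOKENS_PER_FRAME = 576  # hypothetical per-frame cost; the real value comes
                        # from the encoder patch grid and pooling stride

def max_video_tokens(num_frames: int) -> int:
    # Mirrors _get_num_video_tokens: cost grows linearly with frame count.
    return TOKENS_PER_FRAME * num_frames

def max_video_frames(max_tokens: int) -> int:
    # Mirrors _get_max_video_frames: add frames until the budget is exceeded.
    num_frames = 0
    while max_video_tokens(num_frames + 1) <= max_tokens:
        num_frames += 1
    return num_frames

def dummy_num_frames(seq_len: int, max_videos: int) -> int:
    # Mirrors _get_dummy_num_frames: divide the frame budget across videos,
    # always keeping at least one frame.
    max_total_frames = max_video_frames(seq_len)
    return max(max_total_frames // max(max_videos, 1), 1)

print(dummy_num_frames(32768, max_videos=2))  # -> 28 with the assumed cost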
- return inputs - - video_data = multi_modal_data["video"] - - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - vision_config = hf_config.vision_config - - if isinstance(video_data, np.ndarray): - # Supports both CLIP and Siglip - num_frames = video_data.shape[0] - frame_feature_size = \ - get_llava_next_video_frame_feature_size(hf_config) - video_feature_size = num_frames * frame_feature_size - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, - ) - - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - - elif is_list_of(video_data, np.ndarray): - raise NotImplementedError( - "Processing multiple videos is not supported") - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): @@ -246,11 +245,7 @@ class LlavaNextMultiModalProjector(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_input_mapper("video") -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_llava_next_video_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video) +@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 0bebc1c745..1e51e09a24 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -3,47 +3,36 @@ from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from PIL import Image -from transformers import (CLIPVisionConfig, LlavaOnevisionConfig, - SiglipVisionConfig) +from transformers import (BatchFeature, LlavaOnevisionConfig, + LlavaOnevisionProcessor) from transformers.models.llava_onevision.modeling_llava_onevision import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, + VideoProcessorItems) +from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import (CLIPVisionModel, 
dummy_seq_data_for_clip, - dummy_video_for_clip, get_clip_image_feature_size, - get_clip_patch_grid_length, input_processor_for_clip) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, - dummy_video_for_siglip, get_siglip_image_feature_size, - get_siglip_patch_grid_length, input_processor_for_siglip) +from .llava_next import LlavaNextMultiModalProcessor +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -# Result in the max possible feature size (2x2 grid of 336x336px tiles) -MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 - -# For profile run -_MAX_FRAMES_PER_VIDEO = 16 - class LlavaOnevisionVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] @@ -92,286 +81,251 @@ LlavaOnevisionMultiInputs = Union[LlavaOnevisionImageInputs, LlavaOnevisionVideoPixelInputs] -def _get_llava_onevision_image_unppaded_feature_size(height, width, patches, - scale_height, - scale_width): - current_height = patches * scale_height - current_width = patches * scale_width +class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): - original_aspect_ratio = width / height - current_aspect_ratio = current_width / current_height - if original_aspect_ratio > current_aspect_ratio: - new_height = int(height * (current_width / width)) - padding = (current_height - new_height) // 2 - current_height -= padding * 2 - else: - new_width = int(width * (current_height / height)) - padding = (current_width - new_width) // 2 - current_width -= padding * 2 + def _get_hf_config(self) -> LlavaOnevisionConfig: + return self.ctx.get_hf_config(LlavaOnevisionConfig) - unpadded_features = current_height * current_width - newline_features = current_height + def _get_hf_processor(self) -> LlavaOnevisionProcessor: + return self.ctx.get_hf_processor(LlavaOnevisionProcessor) - ratio = math.sqrt(current_height * current_width / (9 * patches**2)) - if ratio > 1.1: - unpadded_features = int(current_height // ratio) * int( - current_width // ratio) - newline_features = int(current_height // ratio) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} - return (unpadded_features, newline_features) + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + max_image_tokens = self._get_max_image_tokens() + num_frames = self._get_dummy_num_frames(seq_len) + max_video_tokens = self._get_max_video_tokens(num_frames) -def get_llava_onevision_image_feature_size( - hf_config: LlavaOnevisionConfig, - *, - input_height: int, - input_width: int, -) -> int: - vision_config = hf_config.vision_config + return { + "image": max_image_tokens, + "video": max_video_tokens, + } - if isinstance(vision_config, CLIPVisionConfig): - num_patches = get_clip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), ) - base_feature_size = 
get_clip_image_feature_size(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_patches = get_siglip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, + + def _get_num_unpadded_features( + self, + *, + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, + ) -> tuple[int, int]: + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + if original_aspect_ratio > current_aspect_ratio: + new_height = int(original_height * + (current_width / original_width)) + padding = (current_height - new_height) // 2 + current_height -= padding * 2 + else: + new_width = int(original_width * + (current_height / original_height)) + padding = (current_width - new_width) // 2 + current_width -= padding * 2 + + unpadded_features = current_height * current_width + newline_features = current_height + + ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) + if ratio > 1.1: + unpadded_features = int(current_height // ratio) * int( + current_width // ratio) + newline_features = int(current_height // ratio) + + return (unpadded_features, newline_features) + + def _get_num_frame_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) + + patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) + + return pooled_grid_length * pooled_grid_length + + def _get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + num_frame_tokens = self._get_num_frame_tokens( + image_width=image_width, + image_height=image_height, ) - base_feature_size = get_siglip_image_feature_size(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - strategy = hf_config.vision_feature_select_strategy - if strategy == "default": - base_feature_size -= 1 - elif strategy == "full": - pass - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") + return num_frame_tokens * num_frames + 1 # Newline token - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_size=(input_height, input_width), - grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=vision_config.image_size, - ) + def _get_max_video_tokens(self, num_frames: int) -> int: + return self._get_num_video_tokens(image_width=999999, + image_height=999999, + num_frames=num_frames) - ( - unpadded_feature_size, - newline_feature_size, - ) = _get_llava_onevision_image_unppaded_feature_size( - input_height, input_width, num_patches, num_patch_height, - num_patch_width) + def _get_max_video_frames(self, max_tokens: int) -> int: + num_frames = 0 - return unpadded_feature_size + newline_feature_size + base_feature_size + while True: + next_num_frames = num_frames + 1 + if self._get_max_video_tokens(next_num_frames) > max_tokens: + break -def get_max_llava_onevision_image_tokens(ctx: InputContext): - return get_llava_onevision_image_feature_size( - ctx.get_hf_config(LlavaOnevisionConfig), - input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) + num_frames = next_num_frames + return num_frames -def 
get_llava_onevision_video_frame_feature_size( - hf_config: LlavaOnevisionConfig) -> int: - # Support both CLIPVisionConfig and SiglipVisionConfig - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - spatial_pool_stride = hf_config.spatial_pool_stride if hasattr( - hf_config, "spatial_pool_stride") else 2 + def _get_dummy_num_frames(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) - height = width = image_size // patch_size - return math.ceil(height / spatial_pool_stride) * math.ceil( - width / spatial_pool_stride) + max_image_tokens = self._get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + return max(max_total_frames // max(max_videos, 1), 1) -def get_llava_onevision_video_tokens(ctx: InputContext, - num_frames: int) -> int: - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) + def _get_video_token(self) -> str: + return self._get_hf_processor().video_token - # TODO: support configuring (not supported by HF right now) - num_token_image_newline = 1 - tokens_per_frame = get_llava_onevision_video_frame_feature_size(hf_config) - video_feature_size = num_frames * tokens_per_frame + num_token_image_newline + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + mm_data = dict(mm_data) + videos = mm_data.pop("videos", []) + assert isinstance(videos, list) - return video_feature_size + if not videos: + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + video_token = self._get_video_token() -def get_max_llava_onevision_video_tokens(ctx: InputContext) -> int: - return get_llava_onevision_video_tokens(ctx, _MAX_FRAMES_PER_VIDEO) - - -def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - vision_config = hf_config.vision_config - - num_videos = mm_counts["video"] - - # TODO: support configuring the number of frames - num_frames = _MAX_FRAMES_PER_VIDEO - video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video") - - mm_data = dummy_video_for_clip(vision_config, - num_frames=num_frames, - num_videos=num_videos) - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video") - - mm_data = dummy_video_for_siglip(vision_config, - num_frames=num_frames, - num_videos=num_videos) - return DummyData(seq_data, mm_data, ranges) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_when_multimodal_input_image(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = 
ctx.get_hf_config(LlavaOnevisionConfig) - vision_config = hf_config.vision_config - - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - width, height = image_data.size - - image_feature_size = get_llava_onevision_image_feature_size( - hf_config, - input_height=height, - input_width=width, + # LLaVA-OneVision processor doesn't support multiple videos + # with different sizes when converting back to tensors + text_image_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) - elif is_list_of(image_data, Image.Image): - image_feature_size = [ - get_llava_onevision_image_feature_size(hf_config, - input_height=img.height, - input_width=img.width) - for img in image_data + + pixel_values_videos = [] + for video in videos: + item_processor_data = dict(prompt=video_token, videos=video) + + item_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=item_processor_data, + mm_kwargs=mm_kwargs, + ) + + pixel_values_videos.append( + item_outputs.pop("pixel_values_videos")[0]) + + combined_outputs = dict( + **text_image_outputs, + pixel_values_videos=pixel_values_videos, + ) + return BatchFeature(combined_outputs) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + image_repls = super()._get_prompt_replacements( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) + + hf_config = self._get_hf_config() + video_token_id = hf_config.video_token_index + + def get_video_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (VideoEmbeddingItems, VideoProcessorItems)) + + if isinstance(videos, VideoEmbeddingItems): + num_video_tokens = videos.get_feature_size(item_idx) + else: + image_size = videos.get_frame_size(item_idx) + num_video_tokens = self._get_num_video_tokens( + image_width=image_size.width, + image_height=image_size.height, + num_frames=videos.get_num_frames(item_idx), + ) + + return [video_token_id] * num_video_tokens + + return image_repls + [ + PromptReplacement( + modality="video", + target=[video_token_id], + replacement=get_video_replacement, + ), ] - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[1] for item in image_data] - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - vision_config = hf_config.vision_config + def _get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) - if isinstance(vision_config, CLIPVisionConfig): - return input_processor_for_clip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, + image_token = self._get_image_token() + video_token = self._get_video_token() + target_width, target_height = self._get_dummy_image_size() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + 
mm_data=mm_data, ) - elif isinstance(vision_config, SiglipVisionConfig): - return input_processor_for_siglip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_when_multimodal_input_video(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "video" not in multi_modal_data: - return inputs - video_data = multi_modal_data["video"] - - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - - if isinstance(video_data, np.ndarray): - # Supports both CLIP and Siglip - num_frames = video_data.shape[0] - video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, - ) - - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - - elif is_list_of(video_data, np.ndarray): - video_feature_size = [] - for video in video_data: - num_frames = video.shape[0] - video_feature_size.append( - get_llava_onevision_video_tokens(ctx, num_frames)) - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, - ) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - else: - raise TypeError(f"Invalid video type: {type(video_data)}") - - msg = f"Unsupported video type: {type(video_data)}" - raise NotImplementedError(msg) - - -def input_processor_for_llava_onevision(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or ("video" not in multi_modal_data - and "image" not in multi_modal_data): - return inputs - if "image" in multi_modal_data: - return input_processor_when_multimodal_input_image(ctx, inputs) - if "video" in multi_modal_data: - return input_processor_when_multimodal_input_video(ctx, inputs) - - msg = "Unsupported multi data type" - raise NotImplementedError(msg) class LlavaOnevisionMultiModalProjector(nn.Module): @@ -394,14 +348,7 @@ class LlavaOnevisionMultiModalProjector(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_input_mapper("video") -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "image", get_max_llava_onevision_image_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_llava_onevision_video_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_onevision) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_onevision) +@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/phi3v.py 
b/vllm/model_executor/models/phi3v.py index f2e49d8e48..7aa9d58d1d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -323,7 +323,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): height=image_height, ) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: max_image_tokens = self._get_num_image_tokens( image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, @@ -415,12 +415,12 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): def _apply_prompt_replacements( self, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, - prompt_repls=prompt_repls, + mm_prompt_repls=mm_prompt_repls, mm_item_counts=mm_item_counts, ) @@ -428,15 +428,23 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): if text.startswith(" <|image|>"): text = text.replace(" <|image|>", "<|image|>", 1) token_ids = [token_ids[0], *token_ids[2:]] - placeholders = [ - _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement) - for p in placeholders - ] + placeholders = { + modality: [ + _PlaceholderInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=p.start_idx - 1, + replacement=p.replacement, + ) for p in ps + ] + for modality, ps in placeholders.items() + } return token_ids, text, placeholders - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d7233bd602..9e1d38512c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -780,15 +780,18 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): def get_max_image_tokens(self) -> int: return get_max_pixtral_hf_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_pixtral_hf_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - class PixtralHFMLP(nn.Module): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d050fd0603..bc3bb1f79b 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -84,7 +84,7 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 @@ -184,15 +184,16 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): ] def 
_always_apply_prompt_replacements(self) -> bool: - # HF never applies prompt replacements, so we have to do it ourselves - # _find_placeholders may incorrectly think that HF has already performed - # processing for multi-audio input when the input audios are short - # (the corresponding placeholders may take up fewer tokens than - # the number of audio items) + # HF never applies prompt replacements, so we have to do it ourselves. + # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF + # has already performed processing for multi-audio input when the input + # audios are short (the corresponding placeholders may take up fewer + # tokens than the number of audio items) return True - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5a8c6e4deb..abca85e0e2 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -56,7 +56,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, NestedTensors, VideoItem) -from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser +from vllm.multimodal.parse import (ImageSize, ModalityDataItems, + MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -641,58 +642,6 @@ class Qwen2VisionTransformer(nn.Module): return loaded_params -# === Vision input helpers === # - - -def _get_vision_info( - vision_config: Qwen2VLVisionConfig, - height: int, - width: int, - min_pixels: int, - max_pixels: int, - *, - do_resize: bool = True, - modality: str = "image", - mm_count: int = 1, -): - """Get information (resized height / width and number of vision tokens) - of input image / video frame.""" - patch_size = vision_config.patch_size - merge_size = vision_config.spatial_merge_size - temporal_patch_size = vision_config.temporal_patch_size - - if do_resize: - resized_height, resized_width = smart_resize( - height=height, - width=width, - factor=patch_size * merge_size, - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - else: - resized_height, resized_width = height, width - - if modality == "image": - grid_t = mm_count - elif modality == "video": - grid_t = max(mm_count // temporal_patch_size, 1) - else: - raise ValueError(f"Modality {modality} is not supported") - - grid_h = resized_height // patch_size - grid_w = resized_width // patch_size - vision_tokens = grid_t * grid_h * grid_w - llm_num_vision_tokens = vision_tokens // (merge_size**2) - - return resized_height, resized_width, llm_num_vision_tokens - - -def _get_image_processor(hf_processor: Qwen2VLProcessor): - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, Qwen2VLImageProcessor) - return image_processor - - class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], dict[str, torch.Tensor]]): @@ -764,32 +713,111 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} - def _get_max_mm_tokens(self, modality: str) -> int: + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + ) -> 
tuple[ImageSize, int]: hf_config = self.ctx.get_hf_config(Qwen2VLConfig) vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + temporal_patch_size = vision_config.temporal_patch_size hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) + image_processor = self._get_image_processor(hf_processor) - _, _, max_llm_image_tokens = _get_vision_info( - vision_config, - height=9999999, - width=9999999, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - modality=modality, + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + grid_t = max(num_frames // temporal_patch_size, 1) + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_vision_tokens = num_patches // (merge_size**2) + + return preprocessed_size, num_vision_tokens + + def _get_dummy_image_size(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=9999999, + image_height=9999999, ) - return max_llm_image_tokens + return max_image_size + + def _get_max_image_tokens(self) -> int: + _, max_image_tokens = self._get_vision_info( + image_width=9999999, + image_height=9999999, + ) + return max_image_tokens + + def _get_max_video_tokens(self, num_frames: int) -> int: + _, max_video_tokens = self._get_vision_info( + image_width=9999999, + image_height=9999999, + num_frames=num_frames, + ) + return max_video_tokens + + def _get_max_video_frames(self, max_tokens: int) -> int: + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + + if self._get_max_video_tokens(next_num_frames) > max_tokens: + break + + num_frames = next_num_frames + + return num_frames + + def _get_dummy_num_frames(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_image_tokens = self._get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + + return max(max_total_frames // max(max_videos, 1), 1) + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + max_image_tokens = self._get_max_image_tokens() + + num_frames = self._get_dummy_num_frames(seq_len) + max_video_tokens = self._get_max_video_tokens(num_frames) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: return { - "image": self._get_max_mm_tokens("image"), - "video": self._get_max_mm_tokens("video"), + "image": max_image_tokens, + "video": max_video_tokens, } def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() + def _get_image_processor(self, hf_processor: Qwen2VLProcessor): + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + return image_processor + def _get_hf_processor( self, *, @@ -797,7 +825,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): max_pixels: Optional[int] = None, ) -> Qwen2VLProcessor: hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) - 
image_processor = _get_image_processor(hf_processor) + image_processor = self._get_image_processor(hf_processor) if min_pixels: image_processor.min_pixels = min_pixels @@ -818,7 +846,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) + image_processor = self._get_image_processor(hf_processor) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered @@ -873,32 +901,35 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): video_grid_thw=MultiModalFieldConfig.batched("video"), ) - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) - - image_token: str = hf_processor.image_token - resized_height, resized_width = smart_resize( - height=9999999, - width=9999999, - factor=image_processor.patch_size * image_processor.merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - ) num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token + video_token: str = hf_processor.video_token + target_width, target_height = self._get_dummy_image_size() mm_data = { "image": - self._get_dummy_images(width=resized_width, - height=resized_height, - num_images=num_images) + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) } return ProcessorInputs( - prompt_text=image_token * num_images, + prompt_text=image_token * num_images + video_token * num_videos, mm_data=mm_data, ) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 115eaaac90..7ea177e94a 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -171,15 +171,18 @@ class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]): def get_max_image_tokens(self) -> int: return get_max_siglip_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_siglip_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa class SiglipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 0b83684c9b..6ad4661e3b 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -6,7 +6,6 @@ from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.utils.checkpoint from torch import nn @@ -31,7 +30,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, 
PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, @@ -62,7 +60,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: feature_extractor = self._get_feature_extractor() max_audio_tokens = math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) @@ -103,6 +101,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): mm_data = dict(mm_data) audios = mm_data.pop("audios", []) + assert isinstance(audios, list) if not audios: return super()._call_hf_processor( @@ -117,9 +116,6 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): sampling_rate=feature_extractor.sampling_rate, ) - # Already resampled by _get_hf_mm_data - assert is_list_of(audios, np.ndarray) - # Ultravox processor doesn't support multiple inputs, # therefore we need to input text and audio one by one audio_features, audio_token_len = [], [] @@ -177,8 +173,9 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ) ] - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 65a773480d..014f02ee10 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,8 +1,12 @@ from abc import ABC, abstractmethod -from typing import Generic, TypeVar +from typing import Final, Generic, Optional, Protocol, TypeVar from transformers import PretrainedConfig +from vllm.multimodal.processing import (BaseMultiModalProcessor, + InputProcessingContext, + ProcessingCache) + _C = TypeVar("_C", bound=PretrainedConfig) @@ -27,11 +31,15 @@ class VisionEncoderInfo(ABC, Generic[_C]): raise NotImplementedError @abstractmethod - def get_num_patches(self) -> int: + def get_image_size(self) -> int: raise NotImplementedError @abstractmethod - def get_image_size(self) -> int: + def get_patch_size(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_patch_grid_length(self) -> int: raise NotImplementedError @@ -50,3 +58,26 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) + + +class VisionLanguageConfig(Protocol): + vision_config: Final[PretrainedConfig] + + +class BaseVisionLanguageMultiModalProcessor(BaseMultiModalProcessor): + + def __init__(self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__(ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks) + + vision_config = self._get_hf_config().vision_config + self._vision_encoder_info = vision_encoder_info(vision_config) + + @abstractmethod + def _get_hf_config(self) -> VisionLanguageConfig: + raise NotImplementedError diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 00acb77435..6be046ba77 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -146,6 +146,20 @@ class 
VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__(self, data: Sequence[HfVideoItem]) -> None: super().__init__(data, "video") + def get_num_frames(self, item_idx: int) -> int: + return len(self.get(item_idx)) + + def get_frame_size(self, item_idx: int) -> ImageSize: + image = self.get(item_idx)[0] # Assume that the video isn't empty + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + class VideoEmbeddingItems(EmbeddingItems): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index eb7552176e..ebc16b8176 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -16,7 +16,8 @@ from transformers import BatchFeature, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens +from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, + encode_tokens) from vllm.utils import LRUCache, flatten_2d_lists, full_groupby from .inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -69,19 +70,6 @@ def _cached_encode( add_special_tokens=add_special_tokens) -def _decode( - tokenizer: AnyTokenizer, - token_ids: list[int], - *, - skip_special_tokens: bool = False, -) -> str: - """ - Backend-agnostic equivalent of HF's - :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. - """ - return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) - - @lru_cache(maxsize=2048) def _cached_decode( tokenizer: AnyTokenizer, @@ -89,9 +77,9 @@ def _cached_decode( *, skip_special_tokens: bool = False, ) -> str: - return _decode(tokenizer, - list(token_ids), - skip_special_tokens=skip_special_tokens) + return decode_tokens(tokenizer, + list(token_ids), + skip_special_tokens=skip_special_tokens) class _HasModalityAttr(Protocol): @@ -269,8 +257,10 @@ class _PromptReplacementTextMatch(_PromptReplacementMatch): return self.match.end() -class _PlaceholderInfo(NamedTuple): +@dataclass +class _PlaceholderInfo: modality: str + item_idx: int start_idx: int replacement: list[int] @@ -311,12 +301,14 @@ def find_text_matches( def _resolve_matches( prompt: _PromptSeq, - matches: Sequence[_PromptReplacementMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], ) -> list[_PromptReplacementMatch]: """ - Resolve :code:`matches` to ensure that there are no overlapping matches, + Resolve :code:`mm_matches` to ensure that there are no overlapping matches, and sort them such that earlier matches take priority over later ones. 
""" + matches = [m for matches in mm_matches.values() for m in matches] + seen_matches: list[Optional[_PromptReplacementMatch]] = [None ] * len(prompt) @@ -334,14 +326,15 @@ def _resolve_matches( def _replace_matches( prompt: _S, - matches: Sequence[_PromptReplacementMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], mm_item_counts: Mapping[str, int], ) -> list[_S]: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" out_seqs = list[_S]() prev_end_idx = 0 next_idx_by_modality = defaultdict[str, int](lambda: 0) - for match in _resolve_matches(prompt, matches): + for match in _resolve_matches(prompt, mm_matches): modality = match.modality item_idx = next_idx_by_modality[modality] @@ -371,28 +364,28 @@ def _replace_matches( def replace_token_matches( prompt: list[int], - matches: Sequence[_PromptReplacementTokenMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]], mm_item_counts: Mapping[str, int], ) -> list[int]: - """Apply :code:`prompt_repls` to :code:`prompt`.""" - if not matches: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" + if not mm_matches: return prompt - token_id_seqs = _replace_matches(prompt, matches, mm_item_counts) + token_id_seqs = _replace_matches(prompt, mm_matches, mm_item_counts) return flatten_2d_lists(token_id_seqs) def replace_text_matches( prompt: str, - matches: Sequence[_PromptReplacementTextMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]], mm_item_counts: Mapping[str, int], ) -> str: - """Apply :code:`prompt_repls` to :code:`prompt`.""" - if not matches: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" + if not mm_matches: return prompt - texts = _replace_matches(prompt, matches, mm_item_counts) + texts = _replace_matches(prompt, mm_matches, mm_item_counts) return "".join(texts) @@ -407,14 +400,14 @@ def _iter_modality_placeholders( return prompt_len = len(prompt) - item_index = 0 + item_idx = 0 start_idx = 0 while start_idx < prompt_len: found = False for repl_info in modality_repls: - replacement = repl_info.get_replacement(item_index) + replacement = repl_info.get_replacement(item_idx) repl_tokens = replacement.token_ids repl_len = len(repl_tokens) end_idx = start_idx + repl_len @@ -425,12 +418,13 @@ def _iter_modality_placeholders( if prompt[start_idx:end_idx] == repl_tokens: yield _PlaceholderInfo( modality=modality, + item_idx=item_idx, start_idx=start_idx, replacement=repl_tokens, ) - item_index += 1 - if item_index >= modal_item_count: + item_idx += 1 + if item_idx >= modal_item_count: return # Exclude overlapping matches @@ -442,28 +436,36 @@ def _iter_modality_placeholders( start_idx += 1 -def iter_placeholders( - prompt_repls: Sequence[_BoundPromptReplacement], +def _iter_placeholders( + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], ) -> Iterable[_PlaceholderInfo]: """ - Yield each set of placeholder tokens found in :code:`prompt`. + For each modality, yield each set of placeholder tokens found in + :code:`prompt`. Note that empty matches are ignored. 
""" - repls_by_modality = dict(full_groupby_modality(prompt_repls)) - for modality, modal_item_count in mm_item_counts.items(): - if modality in repls_by_modality: + if modality in mm_prompt_repls: yield from _iter_modality_placeholders( prompt, modality, - repls_by_modality[modality], + mm_prompt_repls[modality], modal_item_count, ) +def find_mm_placeholders( + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + prompt: list[int], + mm_item_counts: Mapping[str, int], +) -> Mapping[str, list[_PlaceholderInfo]]: + it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) + return dict(full_groupby_modality(it)) + + @dataclass class ProcessorInputs: """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" @@ -620,7 +622,7 @@ class BaseMultiModalProcessor(ABC): raise NotImplementedError @abstractmethod - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: """ Get the maximum possible number of tokens per data item for each modality. @@ -703,14 +705,14 @@ class BaseMultiModalProcessor(ABC): """ raise NotImplementedError - def _find_placeholders( + def _find_mm_placeholders( self, - all_prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> list[_PlaceholderInfo]: - return list( - iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) + ) -> Mapping[str, list[_PlaceholderInfo]]: + return find_mm_placeholders(mm_prompt_repls, new_token_ids, + mm_item_counts) def _get_hf_mm_data( self, @@ -797,7 +799,10 @@ class BaseMultiModalProcessor(ABC): # Some HF processors (e.g. Qwen2-VL) expect corresponding # multi-modal tokens to be in the prompt text - dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts) + dummy_inputs = self._get_dummy_processor_inputs( + self.ctx.model_config.max_model_len, + mm_missing_counts, + ) _, mm_missing_kwargs = self._apply_hf_processor( prompt_text=dummy_inputs.prompt_text, @@ -889,50 +894,44 @@ class BaseMultiModalProcessor(ABC): mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) - if self.enable_sanity_checks: - mm_item_counts = mm_data_items.get_all_counts() - - for modality, item_count in mm_item_counts.items(): - for item_idx in range(item_count): - try: - mm_kwargs.get_item(modality, item_idx) - except Exception as e: - # Make it easy to set a breakpoint in the debugger - raise e - return prompt_ids, mm_kwargs - def _bind_prompt_replacements( + def _bind_and_group_repls( self, prompt_repls: list[PromptReplacement], - ) -> list[_BoundPromptReplacement]: + ) -> dict[str, list[_BoundPromptReplacement]]: tokenizer = self._get_tokenizer() - return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] + it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) + return dict(full_groupby_modality(it)) def _always_apply_prompt_replacements(self) -> bool: """ A flag which can be overridden so that :meth:`_apply_prompt_replacements` is always called even if we - detect that HF has performed processing via :meth:`_find_placeholders`. + detect that HF has performed processing via + :meth:`_find_placeholders_by_modality`. - This is useful in cases where :meth:`_find_placeholders` cannot be - reliably used to detect whether HF has performed processing or not. + This is useful in cases where :meth:`_find_placeholders_by_modality` + cannot be reliably used to detect whether HF has performed processing. 
""" return False def _apply_prompt_replacements( self, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: tokenizer = self._get_tokenizer() - token_matches = find_token_matches(token_ids, prompt_repls) + mm_token_matches = { + modality: find_token_matches(token_ids, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } mm_match_counts = { modality: len(matches) - for modality, matches in full_groupby_modality(token_matches) + for modality, matches in mm_token_matches.items() } # If the search text does not represent a special token, @@ -951,32 +950,92 @@ class BaseMultiModalProcessor(ABC): ): # yapf: disable token_ids = replace_token_matches( token_ids, - token_matches, + mm_token_matches, mm_item_counts, ) - text = _decode(tokenizer, token_ids) - matched_repls = [match.prompt_repl for match in token_matches] + text = decode_tokens(tokenizer, token_ids) + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_token_matches.items() + } else: - text = _decode(tokenizer, token_ids) + text = decode_tokens(tokenizer, token_ids) - text_matches = find_text_matches(text, prompt_repls) + mm_text_matches = { + modality: find_text_matches(text, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } text = replace_text_matches( text, - text_matches, + mm_text_matches, mm_item_counts, ) token_ids = encode_tokens(tokenizer, text, add_special_tokens=False) - matched_repls = [match.prompt_repl for match in text_matches] + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_text_matches.items() + } - placeholders = self._find_placeholders(matched_repls, token_ids, - mm_item_counts) + placeholders = self._find_mm_placeholders( + matched_repls, + token_ids, + mm_item_counts, + ) return token_ids, text, placeholders + def _validate_mm_kwargs( + self, + mm_kwargs: MultiModalKwargs, + mm_item_counts: Mapping[str, int], + ) -> None: + for modality, item_count in mm_item_counts.items(): + if modality in mm_kwargs.modalities: + items = mm_kwargs.get_items(modality) + else: + items = [] + + if len(items) != item_count: + raise RuntimeError( + f"Expected there to be {item_count} {modality} items in " + f"keyword arguments corresponding to {item_count} " + f"{modality} data items, but only found {len(items)}! " + "There is likely a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_mm_fields_config`).") + + def _validate_mm_placeholders( + self, + mm_placeholders: Mapping[str, list[_PlaceholderInfo]], + mm_item_counts: Mapping[str, int], + *, + allow_missing: bool = False, + ) -> Mapping[str, int]: + missing_repl_counts = dict[str, int]() + + for modality, item_count in mm_item_counts.items(): + placeholders = mm_placeholders.get(modality, []) + + if len(placeholders) != item_count and not allow_missing: + raise RuntimeError( + f"Expected there to be {item_count} prompt replacements " + f"corresponding to {item_count} {modality} items, but only " + f"found {len(placeholders)} prompt replacements! 
Either " + "the prompt text has missing/incorrect tokens for " + "multi-modal inputs, or there is a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_prompt_replacements`).") + + missing_repl_counts[modality] = item_count - len(placeholders) + + return missing_repl_counts + def apply( self, prompt_text: str, @@ -1009,56 +1068,69 @@ class BaseMultiModalProcessor(ABC): hf_processor_mm_kwargs, mm_kwargs, ) - prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls) + mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls) + + mm_item_counts = mm_items.get_all_counts() + self._validate_mm_kwargs(mm_kwargs, mm_item_counts) + + hf_mm_placeholders = self._find_mm_placeholders( + mm_prompt_repls, + prompt_ids, + mm_item_counts, + ) + + if self._always_apply_prompt_replacements(): + mm_missing_repl_counts = mm_item_counts + mm_missing_repls = dict(mm_prompt_repls) + else: + mm_missing_repl_counts = self._validate_mm_placeholders( + hf_mm_placeholders, + mm_item_counts, + allow_missing=True, + ) + + mm_missing_repls = dict[str, list[_BoundPromptReplacement]]() + for modality, missing_repl_count in mm_missing_repl_counts.items(): + if missing_repl_count == 0: + mm_missing_repls[modality] = [] + elif missing_repl_count == mm_item_counts.get(modality, 0): + mm_missing_repls[modality] = mm_prompt_repls[modality] + else: + raise ValueError("Partial prompt replacement within " + f"{modality=} is not supported") # If HF processor already inserts placeholder tokens, # there is no need for us to insert them - mm_item_counts = mm_items.get_all_counts() - all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, - mm_item_counts) - - if all_placeholders and not self._always_apply_prompt_replacements(): + if all(len(repls) == 0 for repls in mm_missing_repls.items()): tokenizer = self._get_tokenizer() - prompt_text = _decode(tokenizer, prompt_ids) + prompt_text = decode_tokens(tokenizer, prompt_ids) + mm_placeholders = hf_mm_placeholders else: ( prompt_ids, prompt_text, - all_placeholders, + missing_mm_placeholders, ) = self._apply_prompt_replacements( prompt_ids, - prompt_repls, - mm_item_counts, + mm_missing_repls, + mm_missing_repl_counts, ) - mm_placeholders = dict[str, list[PlaceholderRange]]() - err_suffix = ("This suggests a problem with your implementation of " - "the merged multi-modal processor for this model, " - "particularly in the `_get_prompt_replacements` method.") + mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders} - for modality, placeholders in full_groupby_modality(all_placeholders): - if modality not in mm_items: - raise AssertionError( - f"Expected no placeholders for {modality=}, " - f"but found {placeholders=}. 
Input items: {mm_items}" - f"\n{err_suffix}") + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) - if len(placeholders) != len(mm_items[modality]): - raise AssertionError( - f"Expected length of {placeholders=} for {modality=} " - f"to equal that of input items: {mm_items[modality]}" - f"\n{err_suffix}") - - mm_placeholders[modality] = [ - item.to_range() for item in placeholders - ] + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() + } return MultiModalInputsV2( type="multimodal", prompt=prompt_text, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_placeholders=mm_placeholders, + mm_placeholders=mm_placeholder_ranges, ) def _get_dummy_audios( @@ -1092,8 +1164,9 @@ class BaseMultiModalProcessor(ABC): return [video] * num_videos @abstractmethod - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: """ @@ -1121,12 +1194,25 @@ class BaseMultiModalProcessor(ABC): return mm_limits + def _get_dummy_mm_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalInputsV2: + processor_inputs = self._get_dummy_processor_inputs(seq_len, mm_counts) + + return self.apply( + prompt_text=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) + def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData mm_counts = self._get_and_validate_dummy_mm_counts() - mm_max_tokens_per_item = self.get_mm_max_tokens_per_item() + mm_max_tokens_per_item = self.get_mm_max_tokens_per_item(seq_len) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( "The keys returned by `get_supported_mm_limits`" @@ -1134,13 +1220,7 @@ class BaseMultiModalProcessor(ABC): "returned by `get_mm_max_tokens_per_item` " f"({set(mm_max_tokens_per_item.keys())})") - processor_inputs = self._get_dummy_mm_inputs(mm_counts) - mm_inputs = self.apply( - prompt_text=processor_inputs.prompt_text, - mm_data=processor_inputs.mm_data, - hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, - ) - + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) prompt_token_ids = mm_inputs["prompt_token_ids"] placeholders_by_modality = mm_inputs["mm_placeholders"] @@ -1171,6 +1251,12 @@ class BaseMultiModalProcessor(ABC): "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, total_len, total_placeholders_by_modality) + return DummyData( + seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), + multi_modal_data=None, + multi_modal_placeholders=None, + ) + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) return DummyData( diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 073d49d7d2..fb4389dc4d 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -223,7 +223,8 @@ class MultiModalRegistry: if self.has_processor(model_config): tokenizer = cached_get_tokenizer(model_config.tokenizer) processor = self.create_processor(model_config, tokenizer) - return processor.get_mm_max_tokens_per_item() + seq_len = model_config.max_model_len + return processor.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 42b2f095bc..97920f42ec 100644 --- a/vllm/transformers_utils/tokenizer.py 
+++ b/vllm/transformers_utils/tokenizer.py @@ -21,6 +21,19 @@ AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, MistralTokenizer] +def decode_tokens( + tokenizer: AnyTokenizer, + token_ids: list[int], + *, + skip_special_tokens: bool = False, +) -> str: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. + """ + return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + + def encode_tokens( tokenizer: AnyTokenizer, text: str, From ba214dffbeec070051b61c1985ce6342c947f598 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 4 Jan 2025 23:45:57 +0800 Subject: [PATCH 013/309] [Bugfix] Fix precision error in LLaVA-NeXT (#11735) Signed-off-by: DarkLight1337 --- .../processing/test_llava_next.py | 3 +-- vllm/model_executor/models/llava_next.py | 14 +++++++---- vllm/model_executor/models/llava_onevision.py | 23 ++++++++++++------- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py index 6772130c9b..6c8d300717 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -15,10 +15,9 @@ def processor_for_llava_next(): return LlavaNextMultiModalProcessor -# FIXME: image_size [(198, 176), (176, 198)] @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183)]) + (488, 183), (198, 176), (176, 198)]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_prompt_replacements( processor_for_llava_next, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 3769f04f94..f79021596f 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -2,6 +2,7 @@ from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.nn as nn from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor @@ -139,16 +140,21 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): current_height = npatches * num_patch_height current_width = npatches * num_patch_width - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height + # NOTE: HF resizes based on float32 + original_aspect_ratio = np.array(original_width / original_height, + dtype=np.float32) + current_aspect_ratio = np.array(current_width / current_height, + dtype=np.float32) if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width + scale_factor = np.array(current_width / original_width, + dtype=np.float32) new_height = int(original_height * scale_factor) padding = (current_height - new_height) // 2 current_height -= 2 * padding else: - scale_factor = current_height / original_height + scale_factor = np.array(current_height / original_height, + dtype=np.float32) new_width = int(original_width * scale_factor) padding = (current_width - new_width) // 2 current_width -= 2 * padding diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 1e51e09a24..5a3cdadc47 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ 
b/vllm/model_executor/models/llava_onevision.py @@ -3,6 +3,7 @@ from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.nn as nn from transformers import (BatchFeature, LlavaOnevisionConfig, @@ -127,18 +128,24 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): current_height = npatches * num_patch_height current_width = npatches * num_patch_width - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height + # NOTE: HF resizes based on float32 + original_aspect_ratio = np.array(original_width / original_height, + dtype=np.float32) + current_aspect_ratio = np.array(current_width / current_height, + dtype=np.float32) + if original_aspect_ratio > current_aspect_ratio: - new_height = int(original_height * - (current_width / original_width)) + scale_factor = np.array(current_width / original_width, + dtype=np.float32) + new_height = int(original_height * scale_factor) padding = (current_height - new_height) // 2 - current_height -= padding * 2 + current_height -= 2 * padding else: - new_width = int(original_width * - (current_height / original_height)) + scale_factor = np.array(current_height / original_height, + dtype=np.float32) + new_width = int(original_width * scale_factor) padding = (current_width - new_width) // 2 - current_width -= padding * 2 + current_width -= 2 * padding unpadded_features = current_height * current_width newline_features = current_height From 65c08928c2db934b18f7c6f5eeb02617826fae8e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 4 Jan 2025 23:46:21 +0800 Subject: [PATCH 014/309] [Model] Remove unnecessary weight initialization logic (#11736) Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/layers/resampler.py | 20 ++++---------------- vllm/model_executor/models/aria.py | 5 +---- vllm/model_executor/models/minicpmv.py | 2 -- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index aae806f6af..a67713c320 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -27,7 +27,7 @@ Shared resampler perceiver network used in multimodal models and related helpers for sincos positional embeddings. 
-Example models: Qwen (Qwen-VL), Minicpmv2.0 +Example models: Qwen (Qwen-VL), MiniCPM-V 2.0 """ import math from functools import partial @@ -37,7 +37,6 @@ import numpy as np import torch import torch.nn.functional as F from torch import nn -from torch.nn.init import trunc_normal_ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig @@ -169,8 +168,8 @@ class BaseResampler(nn.Module): self.embed_dim = embed_dim self.num_heads = num_heads - self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) - trunc_normal_(self.query, std=0.02) + self.query = nn.Parameter(torch.empty(self.num_queries, embed_dim)) + if kv_dim is not None and kv_dim != embed_dim: self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, @@ -190,16 +189,7 @@ class BaseResampler(nn.Module): self.ln_post = norm_layer(embed_dim) if do_post_projection else None self.proj = nn.Parameter( (embed_dim**-0.5) * - torch.randn(embed_dim, embed_dim)) if do_post_projection else None - - def _init_weights(self, m: nn.Module) -> None: - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) + torch.empty(embed_dim, embed_dim)) if do_post_projection else None def _repeat(self, query, N: int): return query.unsqueeze(1).repeat(1, N, 1) @@ -240,8 +230,6 @@ class Resampler2(BaseResampler): self.pos_embed = nn.Parameter( torch.from_numpy(pos_embed_arr).requires_grad_(False)) - self.apply(self._init_weights) - def forward( self, x: torch.Tensor, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 2fd4262a9d..8f5fd64a90 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -3,7 +3,6 @@ from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, import torch import torch.nn as nn -from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata @@ -216,9 +215,7 @@ class AriaProjector(nn.Module): self.num_heads = num_heads self.query = nn.Parameter( - torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)) - - trunc_normal_(self.query, std=0.02) + torch.empty(max(patch_to_query_dict.values()), self.embed_dim)) self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 7120225025..8f36437d47 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -141,8 +141,6 @@ class Resampler2_5(BaseResampler): self.max_size = max_size self._set_2d_pos_cache(self.max_size) - self.apply(self._init_weights) - def _set_2d_pos_cache(self, max_size: Tuple[int, int], device: torch.types.Device = "cpu") -> None: From 47831430cc943cd470d38d27f8c69a5782795ec3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 5 Jan 2025 00:07:59 +0800 Subject: [PATCH 015/309] [Bugfix][V1] Fix test_kv_cache_utils.py (#11738) Signed-off-by: Jee Jee Li --- tests/v1/core/test_kv_cache_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index faa3a91de1..2ed70b4299 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -147,12 +147,12 @@ def 
test_generate_block_hash_extra_keys(): # Test with no extra keys extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) - assert extra_keys == (("hash1", 0), ) + assert extra_keys == ("hash1", ) assert next_mm_idx == 1 # Test with partial overlap extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0) - assert extra_keys == (("hash1", 3), ) + assert extra_keys == ("hash1", ) assert next_mm_idx == 1 # Test with no overlap @@ -162,7 +162,7 @@ def test_generate_block_hash_extra_keys(): # Test with multiple extra keys extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0) - assert extra_keys == (("hash1", 0), ("hash2", 0)) + assert extra_keys == ('hash1', 'hash2') assert next_mm_idx == 2 @@ -216,11 +216,11 @@ def test_hash_request_tokens(): # Check the first block assert block_hashes[0].token_ids == (0, 1, 2) - assert block_hashes[0].extra_keys == (("hash1", 0), ) + assert block_hashes[0].extra_keys == ("hash1", ) # Check the second block assert block_hashes[1].token_ids == (3, 4, 5) - assert block_hashes[1].extra_keys == (("hash2", 0), ) + assert block_hashes[1].extra_keys == ("hash2", ) def test_hash_request_tokens_no_mm_inputs(): From 4068f4b5b5dc5e2d1114be0cbb126bc44fb4e906 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:20:34 -0800 Subject: [PATCH 016/309] [MISC] Replace c10::optional with std::optional (#11730) Signed-off-by: Lu Fang --- csrc/attention/paged_attention_v1.cu | 4 +- csrc/attention/paged_attention_v2.cu | 4 +- csrc/cpu/attention.cpp | 8 ++-- csrc/cpu/quant.cpp | 10 ++-- csrc/cpu/torch_bindings.cpp | 6 +-- .../epilogue/scaled_mm_epilogues_c2x.hpp | 6 +-- .../epilogue/scaled_mm_epilogues_c3x.hpp | 6 +-- csrc/cutlass_extensions/torch_utils.hpp | 2 +- csrc/mamba/causal_conv1d/causal_conv1d.cu | 24 +++++----- csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 22 ++++----- csrc/ops.h | 46 +++++++++---------- .../compressed_tensors/int8_quant_kernels.cu | 4 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 18 ++++---- .../cutlass_w8a8/scaled_mm_c3x.cu | 6 +-- .../cutlass_w8a8/scaled_mm_entry.cu | 30 ++++++------ csrc/quantization/machete/generate.py | 2 +- .../machete/machete_mm_kernel.cuh | 10 ++-- .../machete/machete_mm_launcher.cuh | 24 +++++----- .../machete/machete_prepack_launcher.cuh | 2 +- csrc/quantization/machete/machete_pytorch.cu | 26 +++++------ csrc/rocm/attention.cu | 4 +- csrc/rocm/ops.h | 2 +- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu | 2 +- csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 4 +- 24 files changed, 136 insertions(+), 136 deletions(-) diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index cb1a069942..27321148f6 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -53,7 +53,7 @@ void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, + const std::optional& alibi_slopes, float k_scale, float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { @@ -176,7 +176,7 @@ void paged_attention_v1( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t 
max_seq_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index c457bdb890..a453b2243e 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -54,7 +54,7 @@ void paged_attention_v2_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, + const std::optional& alibi_slopes, float k_scale, float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { @@ -187,7 +187,7 @@ void paged_attention_v2( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e21832ba75..ef5b14088c 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes) { + const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -459,7 +459,7 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes) { + int max_seq_len, const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -781,7 +781,7 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double 
k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index d9aed657a3..33b1637832 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& b, // [IC, OC], column-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] - const c10::optional& bias // [OC] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) // Checks for conformality @@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] const torch::Tensor& azp_adj, // [OC] - const c10::optional& azp, // [1] or [M] - const c10::optional& bias // [OC] + const std::optional& azp, // [1] or [M] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) // Checks for conformality @@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] const torch::Tensor& scale, - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); @@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] torch::Tensor& scale, // [..., 1] - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 03beefbc6d..74e4d8189d 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, - const c10::optional& bias); + const std::optional& bias); void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, const torch::Tensor& azp_adj, - const c10::optional& azp, - const c10::optional& bias); + const std::optional& azp, + const std::optional& bias); TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp index 26f7423fd7..ef413e6dd7 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -68,7 +68,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { static_assert(std::is_same_v>); using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? 
static_cast(tensor->data_ptr()) : nullptr; @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index c723adf126..c590c66a66 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -67,7 +67,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; static_assert(std::is_same_v> || @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp index 2c78572521..a1ff933cce 100644 --- a/csrc/cutlass_extensions/torch_utils.hpp +++ b/csrc/cutlass_extensions/torch_utils.hpp @@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor, template static inline auto maybe_make_cute_layout( - c10::optional const& tensor, + std::optional const& tensor, std::string_view name = "tensor") { using Layout = decltype(make_cute_layout(*tensor)); diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index dd1e6de2e0..f0e5533bca 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, const at::Tensor x, const at::Tensor weight, const at::Tensor out, - const c10::optional& bias, + const std::optional& bias, bool silu_activation, int64_t pad_slot_id, - const c10::optional& query_start_loc = std::nullopt, - const c10::optional& cache_indices = std::nullopt, - const c10::optional& has_initial_state = std::nullopt) { + const std::optional& 
query_start_loc = std::nullopt, + const std::optional& cache_indices = std::nullopt, + const std::optional& has_initial_state = std::nullopt) { // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, - const c10::optional &bias_, - const c10::optional &conv_states, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &bias_, + const std::optional &conv_states, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, bool silu_activation, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early @@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, void causal_conv1d_update(const at::Tensor &x, const at::Tensor &conv_state, const at::Tensor &weight, - const c10::optional &bias_, + const std::optional &bias_, bool silu_activation, - const c10::optional &cache_seqlens_, - const c10::optional &conv_state_indices_, + const std::optional &cache_seqlens_, + const std::optional &conv_state_indices_, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early int64_t pad_slot_id) { diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 7162469633..bd0a34119c 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const torch::Tensor out, const torch::Tensor z, const torch::Tensor out_z, - const c10::optional& D, - const c10::optional& delta_bias, + const std::optional& D, + const std::optional& delta_bias, const torch::Tensor ssm_states, bool has_z, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool varlen, int64_t pad_slot_id) { @@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, - const c10::optional &D_, - const c10::optional &z_, - const c10::optional &delta_bias_, + const std::optional &D_, + const std::optional &z_, + const std::optional &delta_bias_, bool delta_softplus, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, const torch::Tensor &ssm_states, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early diff --git a/csrc/ops.h b/csrc/ops.h index 347c502845..9efd9b0c24 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -33,7 +33,7 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& 
kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -44,7 +44,7 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); @@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, torch::Tensor const& a); @@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale, - c10::optional const& azp); + std::optional const& azp); void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, - c10::optional const& azp); + std::optional const& azp); torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, @@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, - c10::optional const& scale_ub); + std::optional const& scale_ub); void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C, - const c10::optional& D_, - const c10::optional& z_, - const c10::optional& delta_bias_, + const std::optional& D_, + const std::optional& z_, + const std::optional& delta_bias_, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, const torch::Tensor& ssm_states, int64_t pad_slot_id); void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight, - const c10::optional& bias_, + const std::optional& bias_, bool silu_activation, - const c10::optional& cache_seqlens_, - const c10::optional& conv_state_indices_, + const std::optional& cache_seqlens_, + 
const std::optional& conv_state_indices_, int64_t pad_slot_id); void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, - const c10::optional& bias_, - const c10::optional& conv_states, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& bias_, + const std::optional& conv_states, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool silu_activation, int64_t pad_slot_id); #ifndef USE_ROCM diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index e9987535bd..e797858271 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] torch::Tensor const& scale, - c10::optional const& azp) { + std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scale.numel() == 1); @@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] - torch::Tensor& scales, c10::optional const& azp) { + torch::Tensor& scales, std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scales.is_contiguous()); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index dbb72e8bbd..865fef5aee 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor 
const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 123f4359c0..e18d7d79e5 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 4f7b6588ef..3f2b52624f 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, @@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void 
cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #if defined CUDA_VERSION && CUDA_VERSION >= 12000 void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, @@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #endif bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { @@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && @@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 2df4d18190..a9b5ddf4cb 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) { static inline std::optional maybe_scalartype( - c10::optional const& t) { + std::optional const& t) { if (!t) { return std::nullopt; } else { diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh index d4d19ae5de..e4af067915 100644 --- a/csrc/quantization/machete/machete_mm_kernel.cuh +++ b/csrc/quantization/machete/machete_mm_kernel.cuh @@ -183,11 +183,11 @@ struct MacheteKernelTemplate { torch::Tensor const& A, // MxK matrix torch::Tensor const& B, // KxN prepacked matrix torch::Tensor& D, // MxN matrix - c10::optional const& maybe_g_scales, // scale_KxN matrix - c10::optional const& maybe_g_zeros, // scale_KxN matrix - c10::optional maybe_group_size, - c10::optional const& maybe_ch_scales, // len N vector - c10::optional const& maybe_tok_scales) // len M vector + std::optional const& maybe_g_scales, // scale_KxN matrix + std::optional const& maybe_g_zeros, // scale_KxN matrix + std::optional maybe_group_size, + std::optional const& maybe_ch_scales, // len N vector + std::optional const& maybe_tok_scales) // len M vector { static_assert(!with_group_zeropoints || with_group_scales); diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh index 4b0da5b303..cabe0af46f 100644 --- a/csrc/quantization/machete/machete_mm_launcher.cuh +++ b/csrc/quantization/machete/machete_mm_launcher.cuh @@ -13,23 +13,23 @@ struct MMArgs { torch::Tensor const& A; torch::Tensor const& B; vllm::ScalarType const& b_type; - c10::optional const& 
maybe_out_type; - c10::optional const& maybe_group_scales; - c10::optional const& maybe_group_zeros; - c10::optional maybe_group_size; - c10::optional const& maybe_channel_scales; - c10::optional const& maybe_token_scales; - c10::optional maybe_schedule; + std::optional const& maybe_out_type; + std::optional const& maybe_group_scales; + std::optional const& maybe_group_zeros; + std::optional maybe_group_size; + std::optional const& maybe_channel_scales; + std::optional const& maybe_token_scales; + std::optional maybe_schedule; }; struct SupportedSchedulesArgs { at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; - c10::optional maybe_group_zeros_type; - c10::optional maybe_channel_scales_type; - c10::optional maybe_token_scales_type; - c10::optional maybe_out_type; + std::optional maybe_group_scales_type; + std::optional maybe_group_zeros_type; + std::optional maybe_channel_scales_type; + std::optional maybe_token_scales_type; + std::optional maybe_out_type; }; torch::Tensor mm_dispatch(MMArgs args); diff --git a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh index 3486d28be2..634b651a4d 100644 --- a/csrc/quantization/machete/machete_prepack_launcher.cuh +++ b/csrc/quantization/machete/machete_prepack_launcher.cuh @@ -10,7 +10,7 @@ struct PrepackBArgs { torch::Tensor const& B; at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; + std::optional maybe_group_scales_type; }; template diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index da2c2fb0d3..05a51ee21d 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -10,11 +10,11 @@ using namespace vllm; std::vector supported_schedules( at::ScalarType a_type, int64_t b_type_id, - c10::optional maybe_group_scales_type, - c10::optional maybe_group_zeros_type, - c10::optional maybe_channel_scales_type, - c10::optional maybe_token_scales_type, - c10::optional maybe_out_type) { + std::optional maybe_group_scales_type, + std::optional maybe_group_zeros_type, + std::optional maybe_channel_scales_type, + std::optional maybe_token_scales_type, + std::optional maybe_out_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return supported_schedules_dispatch({ .a_type = a_type, @@ -29,13 +29,13 @@ std::vector supported_schedules( torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, int64_t b_type_id, - c10::optional const& maybe_out_type, - c10::optional const& maybe_group_scales, - c10::optional const& maybe_group_zeros, - c10::optional maybe_group_size, - c10::optional const& maybe_channel_scales, - c10::optional const& maybe_token_scales, - c10::optional maybe_schedule) { + std::optional const& maybe_out_type, + std::optional const& maybe_group_scales, + std::optional const& maybe_group_zeros, + std::optional maybe_group_size, + std::optional const& maybe_channel_scales, + std::optional const& maybe_token_scales, + std::optional maybe_schedule) { ScalarType const b_type = ScalarType::from_id(b_type_id); return mm_dispatch({.A = A, .B = B, @@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, torch::Tensor prepack_B( torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, - c10::optional const& maybe_group_scales_type) { + std::optional const& maybe_group_scales_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return 
prepack_B_dispatch( {.B = B, diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index b48348a515..0fec9624c4 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -928,7 +928,7 @@ void paged_attention_custom_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, const int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& context_lens, - int max_context_len, const c10::optional& alibi_slopes, + int max_context_len, const std::optional& alibi_slopes, float k_scale, float v_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); @@ -1086,7 +1086,7 @@ void paged_attention( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& context_lens, // [num_seqs] int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale) { const int head_size = query.size(2); if (kv_cache_dtype == "auto") { diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 9f085115a3..34b2f9ce8a 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, double scale, torch::Tensor& block_tables, torch::Tensor& context_lens, int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale); diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 6223dc8cca..5a1879787c 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index d464b045b8..371de0950b 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, @@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) && From 635b897246da121238454ed4b2bbc87cb4d4166b Mon Sep 17 00:00:00 2001 From: cennn <61925104+cennn@users.noreply.github.com> Date: Sun, 5 Jan 2025 23:09:11 +0800 Subject: [PATCH 017/309] [distributed] remove pynccl's redundant stream (#11744) --- tests/distributed/test_pynccl.py | 5 ++-- .../device_communicators/pynccl.py | 28 ++++++------------- vllm/distributed/parallel_state.py | 3 +- 3 files changed, 12 insertions(+), 24 deletions(-) 
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 36cfe42251..a77b48d5e4 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -137,9 +137,8 @@ def worker_fn_with_cudagraph(): # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') torch.cuda.synchronize() - with torch.cuda.graph( - graph, stream=pynccl_comm.stream), pynccl_comm.change_state( - enable=True): + with torch.cuda.graph(graph), \ + pynccl_comm.change_state(enable=True): a_out = pynccl_comm.all_reduce(a) torch.cuda.synchronize() graph.replay() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index a6800f93f1..93d96fd8f5 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -51,7 +51,6 @@ class PyNcclCommunicator: if self.world_size == 1: self.available = False self.disabled = True - self.stream = None return try: self.nccl = NCCLLibrary(library_path) @@ -60,7 +59,6 @@ class PyNcclCommunicator: # e.g. in a non-GPU environment self.available = False self.disabled = True - self.stream = None return self.available = True @@ -98,12 +96,12 @@ class PyNcclCommunicator: with torch.cuda.device(device): self.comm: ncclComm_t = self.nccl.ncclCommInitRank( self.world_size, self.unique_id, self.rank) - self.stream = torch.cuda.Stream() + stream = torch.cuda.current_stream() # A small all_reduce for warmup. data = torch.zeros(1, device=device) self.all_reduce(data) - self.stream.synchronize() + stream.synchronize() del data def all_reduce(self, @@ -122,7 +120,7 @@ class PyNcclCommunicator: out_tensor = torch.empty_like(in_tensor) if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()), buffer_type(out_tensor.data_ptr()), in_tensor.numel(), @@ -144,7 +142,7 @@ class PyNcclCommunicator: f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {input_tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclAllGather( buffer_type(input_tensor.data_ptr()), buffer_type(output_tensor.data_ptr()), input_tensor.numel(), @@ -165,7 +163,7 @@ class PyNcclCommunicator: f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {input_tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclReduceScatter( buffer_type(input_tensor.data_ptr()), buffer_type(output_tensor.data_ptr()), output_tensor.numel(), @@ -180,7 +178,7 @@ class PyNcclCommunicator: f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, self.comm, cudaStream_t(stream.cuda_stream)) @@ -192,7 +190,7 @@ class PyNcclCommunicator: f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) @@ -204,7 +202,7 @@ class PyNcclCommunicator: f"this 
nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() if src == self.rank: sendbuff = buffer_type(tensor.data_ptr()) # NCCL requires the sender also to have a receive buffer @@ -217,9 +215,7 @@ class PyNcclCommunicator: self.comm, cudaStream_t(stream.cuda_stream)) @contextmanager - def change_state(self, - enable: Optional[bool] = None, - stream: Optional[torch.cuda.Stream] = None): + def change_state(self, enable: Optional[bool] = None): """ A context manager to change the state of the communicator. """ @@ -227,15 +223,9 @@ class PyNcclCommunicator: # guess a default value when not specified enable = self.available - if stream is None: - stream = self.stream - old_disable = self.disabled - old_stream = self.stream - self.stream = stream self.disabled = not enable yield self.disabled = old_disable - self.stream = old_stream diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a0d4235460..dccd3addbc 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -310,8 +310,7 @@ class GroupCoordinator: if not pynccl_comm: maybe_pynccl_context = nullcontext() else: - maybe_pynccl_context = pynccl_comm.change_state( - stream=torch.cuda.current_stream()) + maybe_pynccl_context = pynccl_comm.change_state() with maybe_pynccl_context: yield graph_capture_context From eba17173d34548a39989eae2530dce53496a1f3d Mon Sep 17 00:00:00 2001 From: Lancer <402430575@qq.com> Date: Mon, 6 Jan 2025 00:48:16 +0800 Subject: [PATCH 018/309] fix: [doc] fix typo (#11751) Co-authored-by: Lancer --- vllm/core/block/block_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index dca0b3fe8d..90c1438efb 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -23,7 +23,7 @@ class BlockTable: blocks to initialize the BlockTable with. If not provided, an empty BlockTable is created. max_block_sliding_window (Optional[int], optional): The number of - blocks to keep around for each sequance. If None, all blocks + blocks to keep around for each sequence. If None, all blocks are kept (eg., when sliding window is not used). It should at least fit the sliding window size of the model. 
From 33fc1e2e86ce5d60940463f8f71daaa61728d3b7 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:35:01 -0500 Subject: [PATCH 019/309] [Frontend] Improve `StreamingResponse` Exception Handling (#11752) --- vllm/entrypoints/openai/serving_chat.py | 4 ++-- vllm/entrypoints/openai/serving_completion.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9ba5eeb770..89a119ac65 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -301,7 +301,7 @@ class OpenAIServingChat(OpenAIServing): ] * num_choices else: tool_parsers = [None] * num_choices - except RuntimeError as e: + except Exception as e: logger.exception("Error in tool parser creation.") data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" @@ -591,7 +591,7 @@ class OpenAIServingChat(OpenAIServing): completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 17197dce8d..2c9c20caf8 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -371,7 +371,7 @@ class OpenAIServingCompletion(OpenAIServing): # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" From 9e764e7b105a483ebc702cad33922ba8d8c210e1 Mon Sep 17 00:00:00 2001 From: cennn <61925104+cennn@users.noreply.github.com> Date: Mon, 6 Jan 2025 09:05:48 +0800 Subject: [PATCH 020/309] [distributed] remove pynccl's redundant change_state (#11749) --- tests/distributed/test_pynccl.py | 64 ++++++++----------- .../device_communicators/pynccl.py | 17 ----- vllm/distributed/parallel_state.py | 9 +-- 3 files changed, 28 insertions(+), 62 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index a77b48d5e4..a8571a1157 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -59,8 +59,7 @@ def worker_fn(): device=get_world_group().device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) - with pynccl_comm.change_state(enable=True): - tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn(): group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] pynccl_comm = PyNcclCommunicator(group=group, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) - with pynccl_comm.change_state(enable=True): - # two groups can communicate independently - if torch.distributed.get_rank() in [0, 1]: - tensor = pynccl_comm.all_reduce(tensor) - tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() - assert torch.all(tensor == 4).cpu().item() - else: - tensor = pynccl_comm.all_reduce(tensor) - 
torch.cuda.synchronize() - assert torch.all(tensor == 2).cpu().item() + # two groups can communicate independently + if torch.distributed.get_rank() in [0, 1]: + tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() + else: + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -137,8 +135,7 @@ def worker_fn_with_cudagraph(): # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') torch.cuda.synchronize() - with torch.cuda.graph(graph), \ - pynccl_comm.change_state(enable=True): + with torch.cuda.graph(graph): a_out = pynccl_comm.all_reduce(a) torch.cuda.synchronize() graph.replay() @@ -167,8 +164,7 @@ def all_gather_worker_fn(): for r in range(world_size) ]).to(device) - with pynccl_comm.change_state(enable=True): - pynccl_comm.all_gather(result, tensor) + pynccl_comm.all_gather(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -205,8 +201,7 @@ def reduce_scatter_worker_fn(): expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] for tensor in all_tensors).to(device) - with pynccl_comm.change_state(enable=True): - pynccl_comm.reduce_scatter(result, tensor) + pynccl_comm.reduce_scatter(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -233,15 +228,13 @@ def send_recv_worker_fn(): else: tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) - with pynccl_comm.change_state(enable=True): - if pynccl_comm.rank == 0: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % - pynccl_comm.world_size) - else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % - pynccl_comm.world_size) + + if pynccl_comm.rank == 0: + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + else: + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() assert torch.all(tensor == 1).cpu().item() @@ -272,15 +265,12 @@ def multiple_send_recv_worker_fn(): 1024, dtype=torch.float32, device=device) - with pynccl_comm.change_state(enable=True): - if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % - pynccl_comm.world_size) - else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % - pynccl_comm.world_size) + if torch.distributed.get_rank() in [0, 1]: + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + else: + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() if torch.distributed.get_rank() in [0, 2]: assert torch.all(tensor == 1).cpu().item() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 93d96fd8f5..fda4d007ce 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -1,4 +1,3 @@ -from contextlib import contextmanager from typing import Optional, Union # ===================== import region ===================== @@ -213,19 +212,3 @@ class PyNcclCommunicator: self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) - - @contextmanager - def 
change_state(self, enable: Optional[bool] = None): - """ - A context manager to change the state of the communicator. - """ - if enable is None: - # guess a default value when not specified - enable = self.available - - old_disable = self.disabled - - self.disabled = not enable - yield - - self.disabled = old_disable diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index dccd3addbc..a837c1dc59 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -305,14 +305,7 @@ class GroupCoordinator: stream.wait_stream(curr_stream) with torch.cuda.stream(stream), maybe_ca_context: - pynccl_comm = self.pynccl_comm - maybe_pynccl_context: Any - if not pynccl_comm: - maybe_pynccl_context = nullcontext() - else: - maybe_pynccl_context = pynccl_comm.change_state() - with maybe_pynccl_context: - yield graph_capture_context + yield graph_capture_context def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: """ From 402d37836059463c7ec8b1e25d40c29138f1dd40 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Jan 2025 10:18:33 +0800 Subject: [PATCH 021/309] [Doc] [1/N] Reorganize Getting Started section (#11645) Signed-off-by: DarkLight1337 --- docs/source/design/arch_overview.md | 3 +-- docs/source/design/multiprocessing.md | 2 +- docs/source/{usage => getting_started}/faq.md | 0 .../cpu-arm.md} | 2 +- .../cpu-x86.md} | 6 +++--- .../gpu-cuda.md} | 4 ++-- .../gpu-rocm.md} | 2 +- .../hpu-gaudi.md} | 4 +++- .../getting_started/installation/index.md | 19 +++++++++++++++++++ .../neuron.md} | 2 +- .../openvino.md} | 4 ++-- .../tpu.md} | 2 +- .../xpu.md} | 2 +- docs/source/getting_started/quickstart.md | 2 +- .../{debugging.md => troubleshooting.md} | 11 ++++++----- docs/source/index.md | 16 ++++------------ docs/source/models/generative_models.md | 2 +- docs/source/models/pooling_models.md | 2 +- docs/source/serving/distributed_serving.md | 2 +- docs/source/usage/spec_decode.md | 4 ++-- docs/source/usage/structured_outputs.md | 2 +- vllm/utils.py | 2 +- 22 files changed, 54 insertions(+), 41 deletions(-) rename docs/source/{usage => getting_started}/faq.md (100%) rename docs/source/getting_started/{arm-installation.md => installation/cpu-arm.md} (92%) rename docs/source/getting_started/{cpu-installation.md => installation/cpu-x86.md} (95%) rename docs/source/getting_started/{installation.md => installation/gpu-cuda.md} (99%) rename docs/source/getting_started/{amd-installation.md => installation/gpu-rocm.md} (99%) rename docs/source/getting_started/{gaudi-installation.md => installation/hpu-gaudi.md} (99%) create mode 100644 docs/source/getting_started/installation/index.md rename docs/source/getting_started/{neuron-installation.md => installation/neuron.md} (99%) rename docs/source/getting_started/{openvino-installation.md => installation/openvino.md} (90%) rename docs/source/getting_started/{tpu-installation.md => installation/tpu.md} (99%) rename docs/source/getting_started/{xpu-installation.md => installation/xpu.md} (98%) rename docs/source/getting_started/{debugging.md => troubleshooting.md} (94%) diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 475a3e5fa9..2f1280c047 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model That code can be found in . -More details on the API server can be found in the {doc}`OpenAI Compatible -Server ` document. 
+More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. ## LLM Engine diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 34564413b3..da87638e5b 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -2,7 +2,7 @@ ## Debugging -Please see the [Debugging Tips](#debugging-python-multiprocessing) +Please see the [Troubleshooting](#troubleshooting-python-multiprocessing) page for information on known issues and how to solve them. ## Introduction diff --git a/docs/source/usage/faq.md b/docs/source/getting_started/faq.md similarity index 100% rename from docs/source/usage/faq.md rename to docs/source/getting_started/faq.md diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/installation/cpu-arm.md similarity index 92% rename from docs/source/getting_started/arm-installation.md rename to docs/source/getting_started/installation/cpu-arm.md index 799b597b3a..a46e2c0106 100644 --- a/docs/source/getting_started/arm-installation.md +++ b/docs/source/getting_started/installation/cpu-arm.md @@ -2,7 +2,7 @@ # Installation for ARM CPUs -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - CPU backend inference capabilities - Relevant runtime environment variables diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/installation/cpu-x86.md similarity index 95% rename from docs/source/getting_started/cpu-installation.md rename to docs/source/getting_started/installation/cpu-x86.md index c3d3f715ed..bbb2d1872e 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -1,6 +1,6 @@ -(installation-cpu)= +(installation-x86)= -# Installation with CPU +# Installation for x86 CPUs vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: @@ -151,4 +151,4 @@ $ python examples/offline_inference.py $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). + - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. 
Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation/gpu-cuda.md similarity index 99% rename from docs/source/getting_started/installation.md rename to docs/source/getting_started/installation/gpu-cuda.md index 996fb346f4..7ea10bb8b5 100644 --- a/docs/source/getting_started/installation.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -1,6 +1,6 @@ -(installation)= +(installation-cuda)= -# Installation +# Installation for CUDA vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/installation/gpu-rocm.md similarity index 99% rename from docs/source/getting_started/amd-installation.md rename to docs/source/getting_started/installation/gpu-rocm.md index 6d01efbbf8..796911d730 100644 --- a/docs/source/getting_started/amd-installation.md +++ b/docs/source/getting_started/installation/gpu-rocm.md @@ -1,6 +1,6 @@ (installation-rocm)= -# Installation with ROCm +# Installation for ROCm vLLM supports AMD GPUs with ROCm 6.2. diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/installation/hpu-gaudi.md similarity index 99% rename from docs/source/getting_started/gaudi-installation.md rename to docs/source/getting_started/installation/hpu-gaudi.md index 1f2ee62860..94de169f51 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/installation/hpu-gaudi.md @@ -1,4 +1,6 @@ -# Installation with Intel® Gaudi® AI Accelerators +(installation-gaudi)= + +# Installation for Intel® Gaudi® This README provides instructions on running vLLM with Intel Gaudi devices. diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md new file mode 100644 index 0000000000..83de1aff40 --- /dev/null +++ b/docs/source/getting_started/installation/index.md @@ -0,0 +1,19 @@ +(installation-index)= + +# Installation + +vLLM supports the following hardware platforms: + +```{toctree} +:maxdepth: 1 + +gpu-cuda +gpu-rocm +cpu-x86 +cpu-arm +hpu-gaudi +tpu +xpu +openvino +neuron +``` diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/installation/neuron.md similarity index 99% rename from docs/source/getting_started/neuron-installation.md rename to docs/source/getting_started/installation/neuron.md index baaeeb9f53..431f90537f 100644 --- a/docs/source/getting_started/neuron-installation.md +++ b/docs/source/getting_started/installation/neuron.md @@ -1,6 +1,6 @@ (installation-neuron)= -# Installation with Neuron +# Installation for Neuron vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. Paged Attention and Chunked Prefill are currently in development and will be available soon. 
diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/installation/openvino.md similarity index 90% rename from docs/source/getting_started/openvino-installation.md rename to docs/source/getting_started/installation/openvino.md index 8b43c0a904..60f95fd1c4 100644 --- a/docs/source/getting_started/openvino-installation.md +++ b/docs/source/getting_started/installation/openvino.md @@ -1,8 +1,8 @@ (installation-openvino)= -# Installation with OpenVINO +# Installation for OpenVINO -vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: - Prefix caching (`--enable-prefix-caching`) - Chunked prefill (`--enable-chunked-prefill`) diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/installation/tpu.md similarity index 99% rename from docs/source/getting_started/tpu-installation.md rename to docs/source/getting_started/installation/tpu.md index 4d3ac541c9..bc93c44fea 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/installation/tpu.md @@ -1,6 +1,6 @@ (installation-tpu)= -# Installation with TPU +# Installation for TPUs Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/installation/xpu.md similarity index 98% rename from docs/source/getting_started/xpu-installation.md rename to docs/source/getting_started/installation/xpu.md index 9554ae4b7f..be4e3b9bd1 100644 --- a/docs/source/getting_started/xpu-installation.md +++ b/docs/source/getting_started/installation/xpu.md @@ -1,6 +1,6 @@ (installation-xpu)= -# Installation with XPU +# Installation for XPUs vLLM initially supports basic model inferencing and serving on Intel GPU platform. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 9c8b7e4f59..ff216f8af3 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -23,7 +23,7 @@ $ conda activate myenv $ pip install vllm ``` -Please refer to the {ref}`installation documentation ` for more details on installing vLLM. +Please refer to the [installation documentation](#installation-index) for more details on installing vLLM. 
(offline-batched-inference)= diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/troubleshooting.md similarity index 94% rename from docs/source/getting_started/debugging.md rename to docs/source/getting_started/troubleshooting.md index 19eb699572..5a0310da0f 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/troubleshooting.md @@ -1,8 +1,8 @@ -(debugging)= +(troubleshooting)= -# Debugging Tips +# Troubleshooting -This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. +This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. ```{note} Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. @@ -47,6 +47,7 @@ You might also need to set `export NCCL_SOCKET_IFNAME=` If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. +(troubleshooting-incorrect-hardware-driver)= ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. @@ -139,7 +140,7 @@ A multi-node environment is more complicated than a single-node one. If you see Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. ``` -(debugging-python-multiprocessing)= +(troubleshooting-python-multiprocessing)= ## Python multiprocessing ### `RuntimeError` Exception @@ -150,7 +151,7 @@ If you have seen a warning in your logs like this: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing for more information. 
``` diff --git a/docs/source/index.md b/docs/source/index.md index 34f9c4caeb..f390474978 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -50,7 +50,7 @@ For more information, check out the following: - [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. -- {ref}`vLLM Meetups `. +- [vLLM Meetups](#meetups) ## Documentation @@ -58,18 +58,11 @@ For more information, check out the following: :caption: Getting Started :maxdepth: 1 -getting_started/installation -getting_started/amd-installation -getting_started/openvino-installation -getting_started/cpu-installation -getting_started/gaudi-installation -getting_started/arm-installation -getting_started/neuron-installation -getting_started/tpu-installation -getting_started/xpu-installation +getting_started/installation/index getting_started/quickstart -getting_started/debugging getting_started/examples/examples_index +getting_started/troubleshooting +getting_started/faq ``` ```{toctree} @@ -110,7 +103,6 @@ usage/structured_outputs usage/spec_decode usage/compatibility_matrix usage/performance -usage/faq usage/engine_args usage/env_vars usage/usage_stats diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 35e0302b86..383299d61b 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) ## Online Inference -Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: - [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. - [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 76c96c9edc..12ded68eb3 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -106,7 +106,7 @@ A code example can be found here: for more information. +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. ``` ```{warning} diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md index 8302da81b6..8c52c97a41 100644 --- a/docs/source/usage/spec_decode.md +++ b/docs/source/usage/spec_decode.md @@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. 
**vLLM Logprob Stability** \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. + titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). **Conclusion** @@ -195,7 +195,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index 7292012e36..26c09bb0d8 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. - `guided_decoding_backend`: used to select the guided decoding backend to use. -You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. +You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page. Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: diff --git a/vllm/utils.py b/vllm/utils.py index 8ef07d2c32..aadeddabf8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1938,7 +1938,7 @@ def _check_multiproc_method(): "the `spawn` multiprocessing start method. Setting " "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " "See https://docs.vllm.ai/en/latest/getting_started/" - "debugging.html#python-multiprocessing " + "troubleshooting.html#python-multiprocessing " "for more information.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" From 408e5600158bfa34306cfbd034a3779e488752fa Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sun, 5 Jan 2025 20:49:55 -0800 Subject: [PATCH 022/309] [Bugfix] Remove block size constraint (#11723) --- vllm/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b51f978300..b0ed88cb7f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1015,11 +1015,6 @@ class CacheConfig: raise ValueError( "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") - from vllm.platforms import current_platform - if (current_platform.is_cuda() and self.block_size is not None - and self.block_size > 32): - raise ValueError("CUDA Paged Attention kernel only supports " - f"block sizes up to 32. 
Got {self.block_size}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": From 06bfb51963953d6ae31b87965bfb91b6eca4fd24 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 6 Jan 2025 14:24:42 +0900 Subject: [PATCH 023/309] [V1] Add BlockTable class (#11693) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/block_table.py | 78 ++++++++++++++++++++++++++++++ vllm/v1/worker/gpu_input_batch.py | 25 ++++------ vllm/v1/worker/gpu_model_runner.py | 16 +++--- 3 files changed, 94 insertions(+), 25 deletions(-) create mode 100644 vllm/v1/worker/block_table.py diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py new file mode 100644 index 0000000000..26a2084b13 --- /dev/null +++ b/vllm/v1/worker/block_table.py @@ -0,0 +1,78 @@ +from typing import List + +import numpy as np +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class BlockTable: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_blocks_per_req: int, + pin_memory: bool, + device: torch.device, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req + self.pin_memory = pin_memory + self.device = device + + self.block_table = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device=self.device, + dtype=torch.int32, + ) + self.block_table_cpu = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_np = self.block_table_cpu.numpy() + self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32) + + def append_row( + self, + row_idx: int, + start: int, + block_ids: List[int], + ) -> None: + num_blocks = len(block_ids) + self.block_table_np[row_idx, start:start + num_blocks] = block_ids + self.num_blocks_per_row[row_idx] = start + num_blocks + + def add_row(self, row_idx: int, block_ids: List[int]) -> None: + self.append_row(row_idx, 0, block_ids) + + def move_row(self, src: int, tgt: int) -> None: + num_blocks = self.num_blocks_per_row[src] + self.block_table_np[tgt, :num_blocks] = self.block_table_np[ + src, :num_blocks] + self.num_blocks_per_row[tgt] = num_blocks + + def commit(self, num_reqs: int) -> None: + self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs], + non_blocking=True) + + def clear(self) -> None: + self.block_table.fill_(0) + self.block_table_cpu.fill_(0) + + def get_device_tensor(self) -> torch.Tensor: + """Ruturns the device tensor of the block table.""" + return self.block_table + + def get_cpu_tensor(self) -> torch.Tensor: + """Returns the CPU tensor of the block table.""" + return self.block_table_cpu + + def get_numpy_array(self) -> np.ndarray: + """Returns the numpy array of the block table.""" + return self.block_table_np diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index f8a1427c6c..40494e64b2 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -9,6 +9,7 @@ import torch from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange @@ -70,19 +71,14 @@ class InputBatch: self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) - # Attention-related. 
- self.block_table = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device=self.device, - dtype=torch.int32, - ) - self.block_table_cpu_tensor = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device="cpu", - dtype=torch.int32, + # Block table. + self.block_table = BlockTable( + max_num_reqs=max_num_reqs, + max_model_len=max_model_len, + max_num_blocks_per_req=max_num_blocks_per_req, pin_memory=pin_memory, + device=device, ) - self.block_table_cpu = self.block_table_cpu_tensor.numpy() # Sampling-related. self.temperature = torch.empty((max_num_reqs, ), @@ -193,8 +189,7 @@ class InputBatch: self.num_tokens[req_index] = request.num_tokens self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens - num_blocks = len(request.block_ids) - self.block_table_cpu[req_index, :num_blocks] = request.block_ids + self.block_table.add_row(req_index, request.block_ids) sampling_params = request.sampling_params self.temperature_cpu[req_index] = sampling_params.temperature @@ -300,9 +295,7 @@ class InputBatch: self.num_prompt_tokens[last_req_index] self.num_computed_tokens_cpu[ empty_index] = self.num_computed_tokens_cpu[last_req_index] - # TODO(woosuk): Optimize the copy of block_table_cpu. - self.block_table_cpu[empty_index] = self.block_table_cpu[ - last_req_index] + self.block_table.move_row(last_req_index, empty_index) self.temperature_cpu[empty_index] = self.temperature_cpu[ last_req_index] self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 294c76cfb6..31e693235f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -211,10 +211,9 @@ class GPUModelRunner: if num_new_blocks == 0: continue start_index = len(req_state.block_ids) - end_index = start_index + num_new_blocks req_state.block_ids.extend(req_data.new_block_ids) - self.input_batch.block_table_cpu[ - req_index, start_index:end_index] = req_data.new_block_ids + self.input_batch.block_table.append_row(req_index, start_index, + req_data.new_block_ids) req_ids_to_add: List[str] = [] # Add new requests to the cached states. @@ -275,9 +274,7 @@ class GPUModelRunner: # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. - self.input_batch.block_table[:num_reqs].copy_( - self.input_batch.block_table_cpu_tensor[:num_reqs], - non_blocking=True) + self.input_batch.block_table.commit(num_reqs) # Get the number of scheduled tokens for each request. # TODO: The Python loop can be slow. Optimize. @@ -333,8 +330,8 @@ class GPUModelRunner: # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. 
- block_numbers = (self.input_batch.block_table_cpu_tensor.flatten() - [block_table_indices].numpy()) + block_table_cpu = self.input_batch.block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, @@ -450,7 +447,8 @@ class GPUModelRunner: query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_start_loc=seq_start_loc, - block_table=self.input_batch.block_table[:num_reqs], + block_table=( + self.input_batch.block_table.get_device_tensor()[:num_reqs]), slot_mapping=slot_mapping, use_cascade=use_cascade, common_prefix_len=common_prefix_len, From f8fcca100beada88136944976da88f47f363acab Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sun, 5 Jan 2025 23:12:38 -0800 Subject: [PATCH 024/309] [Misc] Fix typo for valid_tool_parses (#11753) Signed-off-by: Rui Qiao --- vllm/entrypoints/openai/api_server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e942b47553..047f699e4f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -767,11 +767,11 @@ async def run_server(args, **uvicorn_kwargs) -> None: if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) - valide_tool_parses = ToolParserManager.tool_parsers.keys() + valid_tool_parses = ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ - and args.tool_call_parser not in valide_tool_parses: + and args.tool_call_parser not in valid_tool_parses: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " - f"(chose from {{ {','.join(valide_tool_parses)} }})") + f"(chose from {{ {','.join(valid_tool_parses)} }})") # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. 
From 022c5c6944bcf28ac4d0d28ce14f2b559358be52 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sun, 5 Jan 2025 23:59:16 -0800 Subject: [PATCH 025/309] [V1] Refactor get_executor_cls (#11754) --- tests/v1/engine/test_engine_core.py | 6 +++--- tests/v1/engine/test_engine_core_client.py | 6 +++--- vllm/v1/engine/async_llm.py | 21 +-------------------- vllm/v1/engine/llm_engine.py | 20 +------------------- vllm/v1/executor/abstract.py | 19 ++++++++++++++++++- 5 files changed, 26 insertions(+), 46 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 954cec734b..8dd9b23fbd 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -8,8 +8,8 @@ from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core import EngineCore +from vllm.v1.executor.abstract import Executor if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", @@ -43,7 +43,7 @@ def test_engine_core(monkeypatch): """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) vllm_config = engine_args.create_engine_config() - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, executor_class=executor_class) @@ -149,7 +149,7 @@ def test_engine_core_advanced_sampling(monkeypatch): """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) vllm_config = engine_args.create_engine_config() - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, executor_class=executor_class) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 20d4e6f63b..5a21806e57 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -11,8 +11,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.executor.abstract import Executor if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", @@ -84,7 +84,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) vllm_config = engine_args.create_engine_config( UsageContext.UNKNOWN_CONTEXT) - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) client = EngineCoreClient.make_client( multiprocess_mode=multiprocessing_mode, asyncio_mode=False, @@ -152,7 +152,7 @@ async def test_engine_core_client_asyncio(monkeypatch): engine_args = EngineArgs(model=MODEL_NAME) vllm_config = engine_args.create_engine_config( usage_context=UsageContext.UNKNOWN_CONTEXT) - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) client = EngineCoreClient.make_client( multiprocess_mode=True, asyncio_mode=True, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0696caf883..b963ba74f1 100644 
--- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -22,7 +22,6 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import initialize_ray_cluster logger = init_logger(__name__) @@ -105,7 +104,7 @@ class AsyncLLM(EngineClient): else: vllm_config = engine_config - executor_class = cls._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) # Create the AsyncLLM. return cls( @@ -127,24 +126,6 @@ class AsyncLLM(EngineClient): if handler := getattr(self, "output_handler", None): handler.cancel() - @classmethod - def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: - executor_class: Type[Executor] - distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "ray": - initialize_ray_cluster(vllm_config.parallel_config) - from vllm.v1.executor.ray_executor import RayExecutor - executor_class = RayExecutor - elif distributed_executor_backend == "mp": - from vllm.v1.executor.multiproc_executor import MultiprocExecutor - executor_class = MultiprocExecutor - else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor - return executor_class - async def add_request( self, request_id: str, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0bd9b52c9b..8ced3a34d2 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -89,7 +89,7 @@ class LLMEngine: # Create the engine configs. vllm_config = engine_args.create_engine_config(usage_context) - executor_class = cls._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) if VLLM_ENABLE_V1_MULTIPROCESSING: logger.debug("Enabling multiprocessing for LLMEngine.") @@ -103,24 +103,6 @@ class LLMEngine: stat_loggers=stat_loggers, multiprocess_mode=enable_multiprocessing) - @classmethod - def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: - executor_class: Type[Executor] - distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "ray": - from vllm.v1.executor.ray_executor import RayExecutor - executor_class = RayExecutor - elif distributed_executor_backend == "mp": - from vllm.v1.executor.multiproc_executor import MultiprocExecutor - executor_class = MultiprocExecutor - else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor - - return executor_class - def get_num_unfinished_requests(self) -> int: return self.detokenizer.get_num_unfinished_requests() diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 564d0447f1..5d74d4b01f 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Tuple +from typing import Tuple, Type from vllm.config import VllmConfig from vllm.v1.outputs import ModelRunnerOutput @@ -8,6 +8,23 @@ from vllm.v1.outputs import ModelRunnerOutput class Executor(ABC): """Abstract class for executors.""" + @staticmethod + def get_class(vllm_config: VllmConfig) -> Type["Executor"]: + executor_class: Type[Executor] + distributed_executor_backend = ( + 
vllm_config.parallel_config.distributed_executor_backend) + if distributed_executor_backend == "ray": + from vllm.v1.executor.ray_executor import RayExecutor + executor_class = RayExecutor + elif distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_executor import MultiprocExecutor + executor_class = MultiprocExecutor + else: + assert (distributed_executor_backend is None) + from vllm.v1.executor.uniproc_executor import UniprocExecutor + executor_class = UniprocExecutor + return executor_class + @abstractmethod def __init__(self, vllm_config: VllmConfig) -> None: raise NotImplementedError From 9c749713f6990a9f9d12e526d9bfc2669dfa8ee6 Mon Sep 17 00:00:00 2001 From: Lucas Tucker <47258766+lucas-tucker@users.noreply.github.com> Date: Mon, 6 Jan 2025 01:59:36 -0600 Subject: [PATCH 026/309] [mypy] Forward pass function type hints in lora (#11740) Signed-off-by: lucast2021 Co-authored-by: lucast2021 --- vllm/lora/layers.py | 12 +++++++++--- vllm/lora/models.py | 3 ++- vllm/model_executor/layers/linear.py | 4 +++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 102e40d3f4..a933ccaecf 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -405,7 +405,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): self.output_size = self.base_layer.output_size self.n_slices = 1 - def forward(self, input_): + def forward( + self, input_: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ReplicatedLinearWithLoRA Args: @@ -496,7 +498,9 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): bias = bias[start_idx:end_idx] return bias - def forward(self, input_): + def forward( + self, input_: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ColumnParallelLinear Args: @@ -833,7 +837,9 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - def forward(self, input_): + def forward( + self, input_: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of RowParallelLinear Args: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9cfcc6bba7..5b7225bdc8 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import math import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Sequence, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union import safetensors.torch import torch @@ -219,6 +219,7 @@ class LoRAModel(AdapterModel): config["vllm_max_position_embeddings"] = max_position_embeddings peft_helper = PEFTHelper.from_dict(config) + unexpected_modules: List[Union[list[str], str]] if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} # Find unexpected modules. 
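The return annotations added in this patch describe a convention these layers appear to share, which the linear.py hunk that follows also reflects: forward returns an (output, optional bias) pair, and the bias slot is only populated when bias addition is deferred via skip_bias_add. The following is a minimal, self-contained sketch of that convention; ToyLinear and its fields are illustrative stand-ins, not code from this patch.

from typing import Optional, Tuple

import torch
import torch.nn as nn


class ToyLinear(nn.Module):
    """Toy layer following the (output, output_bias) return convention."""

    def __init__(self, in_features: int, out_features: int,
                 skip_bias_add: bool = False) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))
        self.skip_bias_add = skip_bias_add

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Fold the bias into the matmul unless the caller deferred it.
        bias = None if self.skip_bias_add else self.bias
        output = nn.functional.linear(x, self.weight, bias)
        # Expose the bias separately only when its addition was deferred.
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias


out, out_bias = ToyLinear(8, 4, skip_bias_add=True)(torch.randn(2, 8))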
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 33b221b994..48cfb1b221 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -238,7 +238,9 @@ class ReplicatedLinear(LinearBase): assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward( + self, x: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: bias = self.bias if not self.skip_bias_add else None assert self.quant_method is not None output = self.quant_method.apply(self, x, bias) From 2a622d704a4270c8d6fab057e8a545ed86ac35b7 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Mon, 6 Jan 2025 00:01:22 -0800 Subject: [PATCH 027/309] k8s-config: Update the secret to use stringData (#11679) Signed-off-by: Suraj Deshmukh --- docs/source/serving/deploying_with_k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md index 77f848088e..5f9b0e4f55 100644 --- a/docs/source/serving/deploying_with_k8s.md +++ b/docs/source/serving/deploying_with_k8s.md @@ -43,7 +43,7 @@ metadata: name: hf-token-secret namespace: default type: Opaque -data: +stringData: token: "REPLACE_WITH_TOKEN" ``` From 996357e4808ca5eab97d4c97c7d25b3073f46aab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Jan 2025 16:02:21 +0800 Subject: [PATCH 028/309] [VLM] Separate out profiling-related logic (#11746) Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 7 +- vllm/model_executor/models/aria.py | 91 ++++--- vllm/model_executor/models/blip2.py | 80 +++--- vllm/model_executor/models/chameleon.py | 86 ++++--- vllm/model_executor/models/fuyu.py | 85 ++++--- vllm/model_executor/models/llava.py | 185 +++++++++----- vllm/model_executor/models/llava_next.py | 75 +++--- .../model_executor/models/llava_next_video.py | 148 ++++++----- vllm/model_executor/models/llava_onevision.py | 174 +++++++------ vllm/model_executor/models/phi3v.py | 106 ++++---- vllm/model_executor/models/qwen2_audio.py | 100 +++++--- vllm/model_executor/models/qwen2_vl.py | 235 +++++++++++------- vllm/model_executor/models/ultravox.py | 91 ++++--- vllm/model_executor/models/vision.py | 37 +-- vllm/multimodal/processing.py | 152 ++++------- vllm/multimodal/profiling.py | 121 +++++++++ vllm/multimodal/registry.py | 2 +- 17 files changed, 1036 insertions(+), 739 deletions(-) create mode 100644 vllm/multimodal/profiling.py diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index b32faa699e..75d878217b 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -586,9 +586,10 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ) processor = processor_factory(ctx, cache=None) + profiler = processor.profiling_info mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - processor.get_supported_mm_limits = mock_supported_mm_limits + profiler.get_supported_mm_limits = mock_supported_mm_limits if is_valid: exc_ctx = nullcontext() @@ -596,7 +597,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): exc_ctx = pytest.raises(ValueError, match="this model only supports") with exc_ctx: - processor._get_and_validate_dummy_mm_counts() + profiler.get_mm_limits() @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @@ -723,7 +724,7 @@ def 
_test_processing_cache_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor._get_dummy_processor_inputs( + prompt = baseline_processor.profiling_info.get_dummy_processor_inputs( model_config.max_model_len, mm_counts, ).prompt_text diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 8f5fd64a90..2e649f10c0 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -24,8 +24,9 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) @@ -444,54 +445,33 @@ def build_mm_projector(config: PretrainedConfig): ) -class AriaMultiModalProcessor(BaseMultiModalProcessor): +class AriaProcessingMixin(ProcessingMixin): + + def _get_hf_config(self): + return self.ctx.get_hf_config() + + def _get_vision_config(self) -> AriaVisionConfig: + return self._get_hf_config().vision_config + + def _get_num_image_tokens(self) -> int: + hf_config = self._get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) + + +class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def _get_num_image_tokens(self) -> int: - hf_config = self.ctx.get_hf_config() - return max(hf_config.projector_patch_to_query_dict.values()) - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - pixel_mask=MultiModalFieldConfig.batched("image"), - ) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config() - image_token_id = hf_config.image_token_index - - num_image_tokens = self._get_num_image_tokens() - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=[image_token_id] * num_image_tokens, - ) - ] - - def _get_dummy_processor_inputs( + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config() - vision_config: AriaVisionConfig = hf_config.vision_config + vision_config = self._get_vision_config() max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) @@ -512,6 +492,41 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor): ) +class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return AriaProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + 
pixel_mask=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self._get_hf_config() + image_token_id = hf_config.image_token_index + + num_image_tokens = self._get_num_image_tokens() + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * num_image_tokens, + ) + ] + + @MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index b3ecb2f22d..fd45783f16 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,8 +4,8 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, import torch import torch.nn as nn -from transformers import (BatchFeature, Blip2Config, Blip2Processor, - Blip2QFormerConfig, apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig, + apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -18,8 +18,9 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .blip import BlipVisionModel @@ -396,20 +397,52 @@ class Blip2QFormerModel(nn.Module): return sequence_output -class Blip2MultiModalProcessor(BaseMultiModalProcessor): +class Blip2ProcessingMixin(ProcessingMixin): + + def _get_hf_config(self): + return self.ctx.get_hf_config(Blip2Config) + + def _get_num_image_tokens(self) -> int: + hf_config = self._get_hf_config() + return hf_config.num_query_tokens + + +class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def _get_num_image_tokens(self) -> int: - hf_config = self.ctx.get_hf_config(Blip2Config) - return hf_config.num_query_tokens - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} - def _get_hf_processor(self) -> Blip2Processor: - return self.ctx.get_hf_processor(Blip2Processor) + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self._get_hf_config() + vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Blip2ProfilingInfo(self.ctx) def _get_mm_fields_config( self, @@ -427,13 +460,13 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: 
- max_image_tokens = self._get_num_image_tokens() + num_image_tokens = self._get_num_image_tokens() return [ PromptReplacement( modality="image", target="", - replacement="" * max_image_tokens + "", + replacement="" * num_image_tokens + "", ) ] @@ -457,29 +490,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor): return result - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - - max_image_size = vision_config.image_size - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=max_image_size, - height=max_image_size, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 1ad44678a5..73ed73b61e 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,8 +31,9 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once @@ -48,54 +49,33 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -class ChameleonMultiModalProcessor(BaseMultiModalProcessor): +class ChameleonProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} + def _get_hf_config(self): + return self.ctx.get_hf_config(ChameleonConfig) + + def _get_hf_processor(self): + return self.ctx.get_hf_processor(ChameleonProcessor) def _get_num_image_tokens(self) -> int: processor = self._get_hf_processor() return processor.image_seq_length + +class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} - def _get_hf_processor(self) -> ChameleonProcessor: - return self.ctx.get_hf_processor(ChameleonProcessor) - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - processor = self._get_hf_processor() - - return [ - PromptReplacement( - modality="image", - target="", - replacement="".join([ - processor.image_start_token, - processor.image_token * self._get_num_image_tokens(), - processor.image_end_token, - ]), - ) - ] - - def _get_dummy_processor_inputs( + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - config = 
self.ctx.get_hf_config(ChameleonConfig) + config = self._get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) @@ -112,6 +92,40 @@ class ChameleonMultiModalProcessor(BaseMultiModalProcessor): mm_data=mm_data, ) + +class ChameleonMultiModalProcessor(ChameleonProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return ChameleonProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self._get_hf_processor(**hf_processor_mm_kwargs) + + return [ + PromptReplacement( + modality="image", + target="", + replacement="".join([ + processor.image_start_token, + processor.image_token * self._get_num_image_tokens(), + processor.image_end_token, + ]), + ) + ] + def apply( self, prompt_text: str, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7cd58fbc7c..c937fcb097 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,8 +35,9 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import ImageProcessorItems, ImageSize from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -63,18 +64,16 @@ class FuyuImagePatchInputs(TypedDict): """ -class FuyuMultiModalProcessor(BaseMultiModalProcessor): +class FuyuProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} + def _get_hf_config(self): + return self.ctx.get_hf_config(FuyuConfig) - def _get_image_target_size(self) -> ImageSize: - processor = self._get_hf_processor() - image_processor: FuyuImageProcessor = processor.image_processor + def _get_hf_processor(self): + return self.ctx.get_hf_processor(FuyuProcessor) - target_size = image_processor.size - return ImageSize(width=target_size["width"], - height=target_size["height"]) + def _get_image_processor(self) -> FuyuImageProcessor: + return self._get_hf_processor().image_processor def _get_image_feature_grid_size( self, @@ -82,7 +81,9 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): image_width: int, image_height: int, ) -> tuple[int, int]: - target_width, target_height = self._get_image_target_size() + image_processor = self._get_image_processor() + target_width = image_processor.size["width"] + target_height = image_processor.size["height"] if not (image_width <= target_width and image_height <= target_height): height_scale_factor = target_height / image_height @@ -96,8 +97,14 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): nrows = math.ceil(image_height / 30) return ncols, nrows + +class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + def get_mm_max_tokens_per_item(self, seq_len: int) -> 
Mapping[str, int]: - target_width, target_height = self._get_image_target_size() + target_width, target_height = self._get_image_size_with_most_features() max_ncols, max_nrows = self._get_image_feature_grid_size( image_width=target_width, @@ -107,8 +114,36 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): return {"image": max_image_tokens} - def _get_hf_processor(self) -> FuyuProcessor: - return self.ctx.get_hf_processor(FuyuProcessor) + def _get_image_size_with_most_features(self) -> ImageSize: + image_processor = self._get_image_processor() + return ImageSize(width=image_processor.size["width"], + height=image_processor.size["height"]) + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = self._get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return FuyuProfilingInfo(self.ctx) def _call_hf_processor( self, @@ -161,7 +196,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(FuyuConfig) + hf_config = self._get_hf_config() bos_token_id = hf_config.bos_token_id tokenizer = self._get_tokenizer() @@ -208,26 +243,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor): return result - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - target_width, target_height = self._get_image_target_size() - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d522378e0b..4299af8cd0 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,4 +1,4 @@ -from abc import abstractmethod +from abc import ABC, abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) @@ -13,6 +13,7 @@ from transformers.models.pixtral import PixtralProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig +from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -25,9 +26,10 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize) -from vllm.multimodal.processing import (InputProcessingContext, +from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessingCache, - ProcessorInputs, PromptReplacement) + ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import 
BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -37,7 +39,7 @@ from .pixtral import (PixtralHFVisionModel, from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import BaseVisionLanguageMultiModalProcessor +from .vision import get_vision_encoder_info class LlavaImagePixelInputs(TypedDict): @@ -94,30 +96,42 @@ class LlavaMultiModalProjector(nn.Module): class LlavaLikeConfig(Protocol): vision_config: Final[PretrainedConfig] + image_token_index: Final[int] vision_feature_select_strategy: Final[str] - vision_feature_layer: Final[Union[int, List[int]]] + vision_feature_layer: Final[Union[int, list[int]]] -class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): +class LlavaLikeProcessor(Protocol): + image_token: Final[str] + + +class BaseLlavaProcessingMixin(ProcessingMixin, ABC): + + def _get_hf_config(self) -> LlavaLikeConfig: + return self.ctx.get_hf_config(LlavaConfig) + + def _get_vision_encoder_info(self): + return get_vision_encoder_info(self._get_hf_config()) @abstractmethod - def _get_hf_config(self) -> LlavaLikeConfig: + def _get_hf_processor(self) -> LlavaLikeProcessor: raise NotImplementedError - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} - - def _get_mm_fields_config( + def _get_num_image_tokens( self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + vision_encoder_info = self._get_vision_encoder_info() + + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), ) def _apply_feature_select_strategy( @@ -133,31 +147,38 @@ class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): msg = f"Unexpected feature select strategy: {strategy!r}" raise NotImplementedError(msg) - def _get_max_image_tokens(self) -> int: - hf_config = self._get_hf_config() - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_max_image_tokens(), +class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self._get_max_image_tokens()} + + def _get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self._get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + + def _get_max_image_tokens(self) -> int: + target_width, target_height = self._get_image_size_with_most_features() + + return self._get_num_image_tokens( + image_width=target_width, + image_height=target_height, ) - def _get_dummy_image_size(self) -> ImageSize: - image_size = self._vision_encoder_info.get_image_size() - return ImageSize(image_size, image_size) - - @abstractmethod - 
def _get_image_token(self) -> str: - raise NotImplementedError - - def _get_dummy_processor_inputs( + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - image_token = self._get_image_token() - target_width, target_height = self._get_dummy_image_size() + processor = self._get_hf_processor() + image_token = processor.image_token + target_width, target_height = self._get_image_size_with_most_features() mm_data = { "image": @@ -172,32 +193,32 @@ class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): ) -class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): +class LlavaProcessingMixin(BaseLlavaProcessingMixin): - def _get_hf_config(self) -> LlavaConfig: - return self.ctx.get_hf_config(LlavaConfig) - - def _get_hf_processor(self) -> LlavaProcessor: + def _get_hf_processor(self): return self.ctx.get_hf_processor(LlavaProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - def _get_num_image_tokens( +class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo): + pass + + +class BaseLlavaMultiModalProcessor(LlavaProcessingMixin, + BaseMultiModalProcessor): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_profiling_info(self) -> BaseProfilingInfo: + raise NotImplementedError + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self._get_hf_config() - - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_num_image_tokens( - image_width=image_width, - image_height=image_height, - ), - ) + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError def _get_prompt_replacements( self, @@ -232,16 +253,37 @@ class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): ] -class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): +class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): - def _get_hf_config(self) -> LlavaConfig: - return self.ctx.get_hf_config(LlavaConfig) + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaProfilingInfo(self.ctx) - def _get_hf_processor(self) -> PixtralProcessor: + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + +class PixtralHFProcessingMixin(BaseLlavaProcessingMixin): + + def _get_hf_processor(self): return self.ctx.get_hf_processor(PixtralProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token + +class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo): + pass + + +class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return PixtralHFProfilingInfo(self.ctx) def _call_hf_processor( self, @@ -270,6 +312,16 @@ class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): return processed_outputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + 
pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, @@ -316,7 +368,7 @@ def _build_llava_or_pixtral_hf_processor( *, cache: Optional[ProcessingCache] = None, enable_sanity_checks: bool = True, -) -> BaseLlavaMultiModalProcessor: +) -> BaseMultiModalProcessor: hf_config = ctx.get_hf_config(LlavaConfig) if isinstance(hf_config.vision_config, PixtralVisionConfig): @@ -663,16 +715,13 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): class MantisMultiModalProcessor(LlavaMultiModalProcessor): - def _get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaProcessor) - def apply( self, prompt_text: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - hf_config = self.ctx.get_hf_config(LlavaConfig) + hf_config = self._get_hf_config() image_token_id = hf_config.image_token_index # Assume that it doesn't depend on the image size diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index f79021596f..c76ec164a3 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,6 +1,6 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -17,12 +17,14 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors from vllm.multimodal.parse import ImageSize +from vllm.multimodal.profiling import BaseProfilingInfo from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector, - init_vision_tower_for_llava) +from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin, + BaseLlavaProfilingInfo, LlavaLikeConfig, + LlavaMultiModalProjector, init_vision_tower_for_llava) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) @@ -60,36 +62,18 @@ LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, LlavaNextImageEmbeddingInputs] -class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): +class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): + image_grid_pinpoints: Final[list[list[int]]] - def _get_hf_config(self) -> LlavaNextConfig: + +class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): + + def _get_hf_config(self) -> LlavaNextLikeConfig: return self.ctx.get_hf_config(LlavaNextConfig) - def _get_hf_processor(self) -> LlavaNextProcessor: + def _get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextProcessor) - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - - def _get_max_image_tokens(self) -> int: - 
largest_feature_size, _ = self._get_pinpoint_with_most_features() - return largest_feature_size - - def _get_dummy_image_size(self) -> ImageSize: - _, pinpoint = self._get_pinpoint_with_most_features() - return pinpoint - # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 def _get_num_image_tokens( self, @@ -98,7 +82,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): image_height: int, ) -> int: hf_config = self._get_hf_config() - vision_encoder_info = self._vision_encoder_info + vision_encoder_info = self._get_vision_encoder_info() base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, @@ -140,7 +124,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): current_height = npatches * num_patch_height current_width = npatches * num_patch_width - # NOTE: HF resizes based on float32 + # NOTE: Use float32 to remain consistent with HF output original_aspect_ratio = np.array(original_width / original_height, dtype=np.float32) current_aspect_ratio = np.array(current_width / current_height, @@ -164,11 +148,10 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): return (unpadded_features, newline_features) - def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: - """ - Get the grid pinpoint with the most features and - the corresponding feature size. - """ + +class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo): + + def _get_image_size_with_most_features(self) -> ImageSize: hf_config = self._get_hf_config() largest_feature_size, largest_feature_pinpoint = 0, None @@ -183,7 +166,25 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): if largest_feature_size == 0 or largest_feature_pinpoint is None: raise ValueError("Cannot have a largest feature size of 0!") - return largest_feature_size, largest_feature_pinpoint + return largest_feature_pinpoint + + +class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin, + BaseLlavaMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaNextProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) @MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index ee6b89f0d4..6e82cee1c9 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -15,11 +15,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, - VideoEmbeddingItems, VideoProcessorItems) -from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems, + 
VideoProcessorItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -28,7 +31,7 @@ from .llava import init_vision_tower_for_llava from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import BaseVisionLanguageMultiModalProcessor +from .vision import get_vision_encoder_info class LlavaNextVideoPixelInputs(TypedDict): @@ -44,30 +47,17 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): +class LlavaNextVideoProcessingMixin(ProcessingMixin): - def _get_hf_config(self) -> LlavaNextVideoConfig: + def _get_hf_config(self): return self.ctx.get_hf_config(LlavaNextVideoConfig) - def _get_hf_processor(self) -> LlavaNextVideoProcessor: + def _get_vision_encoder_info(self): + return get_vision_encoder_info(self._get_hf_config()) + + def _get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextVideoProcessor) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"video": 1} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - num_frames = self._get_dummy_num_frames(seq_len) - max_video_tokens = self._get_max_video_tokens(num_frames) - - return {"video": max_video_tokens} - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) - def _get_num_frame_tokens( self, *, @@ -77,7 +67,8 @@ class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): hf_config = self._get_hf_config() spatial_pool_stride = hf_config.spatial_pool_stride - patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + vision_encoder_info = self._get_vision_encoder_info() + patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length @@ -96,18 +87,43 @@ class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): return num_frame_tokens * num_frames - def _get_max_video_tokens(self, num_frames: int) -> int: - return self._get_num_video_tokens(image_width=999999, - image_height=999999, - num_frames=num_frames) + +class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin, + BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self._get_image_size_with_most_features() + + max_video_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + ) + + return {"video": max_video_tokens} + + def _get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self._get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + num_frames = 0 while True: 
next_num_frames = num_frames + 1 + next_max_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) - if self._get_max_video_tokens(next_num_frames) > max_tokens: + if next_max_tokens > max_tokens: break num_frames = next_num_frames @@ -122,12 +138,45 @@ class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): return max(max_total_frames // max(max_videos, 1), 1) - def _get_dummy_image_size(self) -> ImageSize: - image_size = self._vision_encoder_info.get_image_size() - return ImageSize(image_size, image_size) + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_videos = mm_counts.get("video", 0) - def _get_video_token(self) -> str: - return self._get_hf_processor().video_token + processor = self._get_hf_processor() + video_token = processor.video_token + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=video_token * num_videos, + mm_data=mm_data, + ) + + +class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaNextVideoProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) def _get_prompt_replacements( self, @@ -162,36 +211,11 @@ class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): ), ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_videos = mm_counts.get("video", 0) - - video_token = self._get_video_token() - target_width, target_height = self._get_dummy_image_size() - - mm_data = { - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - num_videos=num_videos, - ) - } - - return ProcessorInputs( - prompt_text=video_token * num_videos, - mm_data=mm_data, - ) - # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): - def __init__(self, config): + def __init__(self, config: LlavaNextVideoConfig): super().__init__() mode = config.spatial_pool_mode @@ -209,7 +233,7 @@ class LlavaNextVideoPooler(nn.Module): raise ValueError( f"Unknown pooling mode: {mode}. 
Expected [`average`, `max`]") - def forward(self, image_features): + def forward(self, image_features: torch.Tensor): ori_width = int( math.sqrt(image_features.shape[1] * self.image_size // self.image_size)) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5a3cdadc47..6dccc1e0d3 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,7 +1,7 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -21,15 +21,16 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, VideoProcessorItems) -from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, - PromptReplacement) +from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import init_vision_tower_for_llava -from .llava_next import LlavaNextMultiModalProcessor +from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava +from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor, + LlavaNextProcessingMixin) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -82,40 +83,18 @@ LlavaOnevisionMultiInputs = Union[LlavaOnevisionImageInputs, LlavaOnevisionVideoPixelInputs] -class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): +class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol): + video_token_index: Final[int] - def _get_hf_config(self) -> LlavaOnevisionConfig: + +class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): + + def _get_hf_config(self) -> LlavaOnevisionLikeConfig: return self.ctx.get_hf_config(LlavaOnevisionConfig) - def _get_hf_processor(self) -> LlavaOnevisionProcessor: + def _get_hf_processor(self): return self.ctx.get_hf_processor(LlavaOnevisionProcessor) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - max_image_tokens = self._get_max_image_tokens() - - num_frames = self._get_dummy_num_frames(seq_len) - max_video_tokens = self._get_max_video_tokens(num_frames) - - return { - "image": max_image_tokens, - "video": max_video_tokens, - } - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - pixel_values_videos=MultiModalFieldConfig.batched("video"), - ) - def _get_num_unpadded_features( self, *, @@ -128,7 +107,7 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): current_height = npatches * num_patch_height current_width = npatches * 
num_patch_width - # NOTE: HF resizes based on float32 + # NOTE: Use float32 to remain consistent with HF output original_aspect_ratio = np.array(original_width / original_height, dtype=np.float32) current_aspect_ratio = np.array(current_width / current_height, @@ -167,7 +146,8 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): hf_config = self._get_hf_config() spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) - patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + vision_encoder_info = self._get_vision_encoder_info() + patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length @@ -186,18 +166,33 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): return num_frame_tokens * num_frames + 1 # Newline token - def _get_max_video_tokens(self, num_frames: int) -> int: - return self._get_num_video_tokens(image_width=999999, - image_height=999999, - num_frames=num_frames) + +class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, + BaseLlavaProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self._get_max_image_tokens(), + "video": self._get_max_video_tokens(seq_len), + } def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + num_frames = 0 while True: next_num_frames = num_frames + 1 + next_max_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) - if self._get_max_video_tokens(next_num_frames) > max_tokens: + if next_max_tokens > max_tokens: break num_frames = next_num_frames @@ -215,8 +210,65 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): return max(max_total_frames // max(max_videos, 1), 1) - def _get_video_token(self) -> str: - return self._get_hf_processor().video_token + def _get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + + return self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + ) + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + processor = self._get_hf_processor() + image_token = processor.image_token + video_token = processor.video_token + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, + ) + + +class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin, + LlavaNextMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaOnevisionProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, 
object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), + ) def _call_hf_processor( self, @@ -235,7 +287,8 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): mm_kwargs=mm_kwargs, ) - video_token = self._get_video_token() + processor = self._get_hf_processor() + video_token = processor.video_token # LLaVA-OneVision processor doesn't support multiple videos # with different sizes when converting back to tensors @@ -303,37 +356,6 @@ class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): ), ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - - image_token = self._get_image_token() - video_token = self._get_video_token() - target_width, target_height = self._get_dummy_image_size() - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images), - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - num_videos=num_videos, - ) - } - - return ProcessorInputs( - prompt_text=image_token * num_images + video_token * num_videos, - mm_data=mm_data, - ) - class LlavaOnevisionMultiModalProjector(nn.Module): diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7aa9d58d1d..c8418c14e5 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -14,7 +14,7 @@ # limitations under the License. 
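The phi3v.py changes below follow the same structure as the models above: shared config/processor helpers move into a *ProcessingMixin, profiling-only logic (modality limits, max token counts, dummy inputs) moves into a *ProfilingInfo class, and the multimodal processor hands that object out via _get_profiling_info. The sketch below is a self-contained toy version of that split under assumed behavior; all names are hypothetical and nothing is imported from vLLM.

from typing import Mapping


class ToyProcessingMixin:
    """Helpers shared by the processor and its profiling info."""

    def _get_num_image_tokens(self) -> int:
        # Assumed fixed feature size for this toy model.
        return 64


class ToyProfilingInfo(ToyProcessingMixin):
    """Profiling-only logic: modality limits and dummy inputs."""

    def get_supported_mm_limits(self) -> Mapping[str, int]:
        return {"image": 1}

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
        return {"image": self._get_num_image_tokens()}

    def get_dummy_processor_inputs(self, seq_len: int,
                                   mm_counts: Mapping[str, int]) -> dict:
        # Build the worst-case prompt used for memory profiling.
        return {"prompt_text": "<image>" * mm_counts.get("image", 0)}


class ToyMultiModalProcessor(ToyProcessingMixin):
    """Runtime request processing; hands out profiling info on demand."""

    def _get_profiling_info(self) -> ToyProfilingInfo:
        return ToyProfilingInfo()


info = ToyMultiModalProcessor()._get_profiling_info()
print(info.get_mm_max_tokens_per_item(seq_len=2048))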
from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -28,22 +28,23 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement, _BoundPromptReplacement, _PlaceholderInfo) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import dummy_image_for_clip +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, @@ -54,10 +55,6 @@ logger = init_logger(__name__) # Cannot find the following 2 numbers from hf config. _IMAGE_TOKEN_ID = 32044 -# Result in the max possible feature size (h:w = 16:1) -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000 -MAX_IMAGE_FEATURE_SIZE_WIDTH = 50 - CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0, hidden_act="quick_gelu", hidden_size=1024, @@ -305,10 +302,17 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): return image_features_hd_newline -class Phi3VMultiModalProcessor(BaseMultiModalProcessor): +class Phi3VProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} + def _get_hf_processor( + self, + *, + num_crops: Optional[int] = None, + ) -> ProcessorMixin: + if num_crops is not None: + return self.ctx.get_hf_processor(num_crops=num_crops) + + return self.ctx.get_hf_processor() def _get_num_image_tokens( self, @@ -323,23 +327,55 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): height=image_height, ) + +class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self._get_image_size_with_most_features() + max_image_tokens = self._get_num_image_tokens( - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width=target_width, + image_height=target_height, ) return {"image": max_image_tokens} - def _get_hf_processor( - self, - *, - num_crops: Optional[int] = None, - ) -> ProcessorMixin: - if num_crops is not None: - return self.ctx.get_hf_processor(num_crops=num_crops) + def _get_image_size_with_most_features(self) -> ImageSize: + # Result in the max possible feature size (h:w = 16:1) + return ImageSize(height=8000, width=50) - return 
self.ctx.get_hf_processor() + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + hf_processor = self._get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + + return ProcessorInputs( + prompt_text="".join(image_tokens[:num_images]), + mm_data=mm_data, + ) + + +class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Phi3VProfilingInfo(self.ctx) def _call_hf_processor( self, @@ -377,10 +413,10 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() + hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) image_tokens: list[str] = hf_processor.img_tokens # type: ignore tokenizer = self._get_tokenizer() @@ -442,28 +478,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): return token_ids, text, placeholders - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - - data = dummy_image_for_clip( - CLIP_VIT_LARGE_PATCH14_336_CONFIG, - num_images, - image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) - - hf_processor = self._get_hf_processor() - image_tokens: list[str] = hf_processor.img_tokens # type: ignore - - return ProcessorInputs( - prompt_text="".join(image_tokens[:num_images]), - mm_data=data, - ) - def apply( self, prompt_text: str, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index bc3bb1f79b..a7bb3425ed 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -20,8 +20,8 @@ # limitations under the License. 
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import cached_property -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -40,8 +40,9 @@ from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -79,17 +80,10 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): +class Qwen2AudioProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"audio": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) - max_source_positions = hf_config.audio_config.max_source_positions - max_output_lengths = (max_source_positions - 2) // 2 + 1 - - return {"audio": max_output_lengths} + def _get_hf_config(self): + return self.ctx.get_hf_config(Qwen2AudioConfig) def _get_hf_processor( self, @@ -99,8 +93,57 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor) - def _get_feature_extractor(self) -> WhisperFeatureExtractor: - return self._get_hf_processor().feature_extractor # type: ignore + def _get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + feature_extractor = hf_processor.feature_extractor # type: ignore + assert isinstance(feature_extractor, WhisperFeatureExtractor) + return feature_extractor + + +class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + hf_config = self._get_hf_config() + max_source_positions = hf_config.audio_config.max_source_positions + max_output_lengths = (max_source_positions - 2) // 2 + 1 + + return {"audio": max_output_lengths} + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self._get_feature_extractor() + + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) + + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + return ProcessorInputs( + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, + ) + + +class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Qwen2AudioProfilingInfo(self.ctx) def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() @@ -110,7 +153,7 @@ class 
Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): self, prompt: str, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], + mm_kwargs: Mapping[str, Any], ) -> BatchFeature: mm_data = dict(mm_data) audios = mm_data.pop("audios", []) @@ -118,7 +161,7 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): if audios: mm_data["audios"] = audios - feature_extractor = self._get_feature_extractor() + feature_extractor = self._get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, @@ -151,7 +194,7 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) + hf_config = self._get_hf_config() placeholder = hf_config.audio_token_index feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") @@ -191,27 +234,6 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): # tokens than the number of audio items) return True - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() - - sampling_rate = feature_extractor.sampling_rate - audio_len = feature_extractor.chunk_length * sampling_rate - num_audios = mm_counts.get("audio", 0) - - mm_data = { - "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) - } - - return ProcessorInputs( - prompt_text="<|AUDIO|>" * num_audios, - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index abca85e0e2..a5c2fb9e84 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -59,8 +59,9 @@ from vllm.multimodal.inputs import (ImageItem, ModalityData, from vllm.multimodal.parse import (ImageSize, ModalityDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -708,10 +709,44 @@ class Qwen2MultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): +class Qwen2VLProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} + def _get_hf_config(self): + return self.ctx.get_hf_config(Qwen2VLConfig) + + def _get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + ) -> Qwen2VLProcessor: + hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + + if min_pixels: + image_processor.min_pixels = min_pixels + if max_pixels: + image_processor.max_pixels = max_pixels + if max_pixels or min_pixels: + image_processor.size = { + "min_pixels": image_processor.min_pixels, + "max_pixels": image_processor.max_pixels, + } + + return hf_processor + + def 
_get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + ): + hf_processor = self._get_hf_processor(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + return image_processor def _get_vision_info( self, @@ -721,14 +756,13 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): num_frames: int = 1, do_resize: bool = True, ) -> tuple[ImageSize, int]: - hf_config = self.ctx.get_hf_config(Qwen2VLConfig) + hf_config = self._get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size temporal_patch_size = vision_config.temporal_patch_size - hf_processor = self._get_hf_processor() - image_processor = self._get_image_processor(hf_processor) + image_processor = self._get_image_processor() if do_resize: resized_height, resized_width = smart_resize( @@ -753,7 +787,45 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): return preprocessed_size, num_vision_tokens - def _get_dummy_image_size(self) -> ImageSize: + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + ) + return num_image_tokens + + def _get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + _, num_video_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + num_frames=num_frames, + ) + return num_video_tokens + + +class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self._get_max_image_tokens(), + "video": self._get_max_video_tokens(seq_len), + } + + def _get_image_size_with_most_features(self) -> ImageSize: max_image_size, _ = self._get_vision_info( image_width=9999999, image_height=9999999, @@ -761,27 +833,27 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): return max_image_size def _get_max_image_tokens(self) -> int: - _, max_image_tokens = self._get_vision_info( - image_width=9999999, - image_height=9999999, - ) - return max_image_tokens + target_width, target_height = self._get_image_size_with_most_features() - def _get_max_video_tokens(self, num_frames: int) -> int: - _, max_video_tokens = self._get_vision_info( - image_width=9999999, - image_height=9999999, - num_frames=num_frames, + return self._get_num_image_tokens( + image_width=target_width, + image_height=target_height, ) - return max_video_tokens def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + num_frames = 0 while True: next_num_frames = num_frames + 1 + next_max_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) - if self._get_max_video_tokens(next_num_frames) > max_tokens: + if next_max_tokens > max_tokens: break num_frames = next_num_frames @@ -797,56 +869,73 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - return max(max_total_frames // max(max_videos, 1), 1) + num_frames = max(max_total_frames 
// max(max_videos, 1), 1) - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - max_image_tokens = self._get_max_image_tokens() + # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 + if num_frames > 1 and num_frames % 2 == 1: + num_frames += 1 - num_frames = self._get_dummy_num_frames(seq_len) - max_video_tokens = self._get_max_video_tokens(num_frames) + return num_frames - return { - "image": max_image_tokens, - "video": max_video_tokens, + def _get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + + return self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + ) + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token + video_token: str = hf_processor.video_token + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) } + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, + ) + + +class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Qwen2VLProfilingInfo(self.ctx) + def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() - def _get_image_processor(self, hf_processor: Qwen2VLProcessor): - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, Qwen2VLImageProcessor) - return image_processor - - def _get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - ) -> Qwen2VLProcessor: - hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = self._get_image_processor(hf_processor) - - if min_pixels: - image_processor.min_pixels = min_pixels - if max_pixels: - image_processor.max_pixels = max_pixels - if max_pixels or min_pixels: - image_processor.size = { - "min_pixels": image_processor.min_pixels, - "max_pixels": image_processor.max_pixels, - } - - return hf_processor - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() - image_processor = self._get_image_processor(hf_processor) + hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self._get_image_processor(**hf_processor_mm_kwargs) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered @@ -901,38 +990,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): video_grid_thw=MultiModalFieldConfig.batched("video"), ) - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - - hf_processor = 
self._get_hf_processor() - image_token: str = hf_processor.image_token - video_token: str = hf_processor.video_token - target_width, target_height = self._get_dummy_image_size() - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images), - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - num_videos=num_videos, - ) - } - - return ProcessorInputs( - prompt_text=image_token * num_images + video_token * num_videos, - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 6ad4661e3b..ba823acecb 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,8 +3,8 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.utils.checkpoint @@ -26,8 +26,9 @@ from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -55,7 +56,30 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, UltravoxAudioEmbeddingInputs] -class UltravoxMultiModalProcessor(BaseMultiModalProcessor): +class UltravoxProcessingMixin(ProcessingMixin): + + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> ProcessorMixin: + return self.ctx.get_hf_processor() + + def _get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + audio_processor = hf_processor.audio_processor # type: ignore + feature_extractor = audio_processor.feature_extractor # type: ignore + assert isinstance(feature_extractor, WhisperFeatureExtractor) + return feature_extractor + + +class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} @@ -67,17 +91,33 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): return {"audio": max_audio_tokens} - def _get_hf_processor( + def get_dummy_processor_inputs( self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> ProcessorMixin: - return self.ctx.get_hf_processor() + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self._get_feature_extractor() - def _get_feature_extractor(self) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor() - return hf_processor.audio_processor.feature_extractor # type: ignore + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) + + mm_data = { + "audio": + 
self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + return ProcessorInputs( + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, + ) + + +class UltravoxMultiModalProcessor(UltravoxProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return UltravoxProfilingInfo(self.ctx) def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() @@ -155,10 +195,10 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() + hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) placeholder = hf_processor.audio_token_replacement # type: ignore def get_replacement_ultravox(item_idx: int): @@ -173,27 +213,6 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ) ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() - - sampling_rate = feature_extractor.sampling_rate - audio_len = feature_extractor.chunk_length * sampling_rate - num_audios = mm_counts.get("audio", 0) - - mm_data = { - "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) - } - - return ProcessorInputs( - prompt_text="<|audio|>" * num_audios, - mm_data=mm_data, - ) - class StackAudioFrames(nn.Module): """ diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 014f02ee10..8516c9f706 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,12 +1,8 @@ from abc import ABC, abstractmethod -from typing import Final, Generic, Optional, Protocol, TypeVar +from typing import Final, Generic, Protocol, TypeVar from transformers import PretrainedConfig -from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, - ProcessingCache) - _C = TypeVar("_C", bound=PretrainedConfig) @@ -43,12 +39,18 @@ class VisionEncoderInfo(ABC, Generic[_C]): raise NotImplementedError -def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: +class VisionLanguageConfig(Protocol): + vision_config: Final[PretrainedConfig] + + +def get_vision_encoder_info( + hf_config: VisionLanguageConfig) -> VisionEncoderInfo: # Avoid circular imports from .clip import CLIPEncoderInfo, CLIPVisionConfig from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig from .siglip import SiglipEncoderInfo, SiglipVisionConfig + vision_config = hf_config.vision_config if isinstance(vision_config, CLIPVisionConfig): return CLIPEncoderInfo(vision_config) if isinstance(vision_config, PixtralVisionConfig): @@ -58,26 +60,3 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) - - -class VisionLanguageConfig(Protocol): - vision_config: Final[PretrainedConfig] - - -class BaseVisionLanguageMultiModalProcessor(BaseMultiModalProcessor): - - def __init__(self, - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: - super().__init__(ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks) - - vision_config = self._get_hf_config().vision_config - 
self._vision_encoder_info = vision_encoder_info(vision_config) - - @abstractmethod - def _get_hf_config(self) -> VisionLanguageConfig: - raise NotImplementedError diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index ebc16b8176..933c1d3aff 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -8,11 +8,10 @@ from functools import lru_cache from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np -import numpy.typing as npt import torch from blake3 import blake3 from PIL import Image -from transformers import BatchFeature, ProcessorMixin +from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger @@ -24,6 +23,7 @@ from .inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser +from .profiling import BaseProfilingInfo logger = init_logger(__name__) @@ -466,14 +466,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -@dataclass -class ProcessorInputs: - """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" - prompt_text: str - mm_data: MultiModalDataDict - hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) - - class ProcessingCache: def __init__(self, capacity: int) -> None: @@ -585,9 +577,33 @@ class ProcessingCache: self._cache.put(cache_key, output_kwargs) -class BaseMultiModalProcessor(ABC): +class ProcessingMixin: + """ + Contains helper functions to perform processing. + + Not to be confused with :class:`transformers.ProcessorMixin`. + """ + ctx: InputProcessingContext + + def _get_tokenizer(self) -> AnyTokenizer: + return self.ctx.tokenizer + + def _get_hf_config(self) -> PretrainedConfig: + return self.ctx.get_hf_config() + + def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + """ + Subclasses can override this method to handle + specific kwargs from model config or user inputs. + """ + return self.ctx.get_hf_processor(**kwargs) + + +class BaseMultiModalProcessor(ProcessingMixin, ABC): """ Abstract base class to process multi-modal inputs to be used in vLLM. + + Not to be confused with :class:`transformers.ProcessorMixin`. """ def __init__(self, @@ -601,6 +617,9 @@ class BaseMultiModalProcessor(ABC): self.cache = cache self.enable_sanity_checks = enable_sanity_checks + self.data_parser = self._get_data_parser() + self.profiling_info = self._get_profiling_info() + def __call__( self, prompt: str, @@ -609,32 +628,9 @@ class BaseMultiModalProcessor(ABC): ) -> MultiModalInputsV2: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) - @abstractmethod - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - """ - Return the maximum supported number of items for each modality. - - A value of `None` means unlimited number of items. - - Omitting a modality from the returned dictionary means that - it is not supported at all. - """ - raise NotImplementedError - - @abstractmethod - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - """ - Get the maximum possible number of tokens per data item - for each modality. - - The dictionary returned by this method should have the same - keys as that returned by :meth:`get_supported_mm_limits`. 
- """ - raise NotImplementedError - def _get_data_parser(self) -> MultiModalDataParser: """ - Construct a data parser to preprocess multi-modal data items + Construct a parser to preprocess multi-modal data items before passing them to :meth:`_get_hf_mm_data`. You can support additional modalities by creating a subclass @@ -642,15 +638,12 @@ class BaseMultiModalProcessor(ABC): """ return MultiModalDataParser() - def _get_hf_processor(self) -> ProcessorMixin: + def _get_profiling_info(self) -> BaseProfilingInfo: """ - Subclasses can add keyword arguments to this method to accept - additional kwargs from model config or user inputs. + Get the profiling information to find the worst-case memory usage of + the model. """ - return self.ctx.get_hf_processor() - - def _get_tokenizer(self) -> AnyTokenizer: - return self.ctx.tokenizer + raise NotImplementedError def _to_mm_items( self, @@ -660,8 +653,7 @@ class BaseMultiModalProcessor(ABC): Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` before passing them to :meth:`_get_hf_mm_data`. """ - parser = self._get_data_parser() - mm_items = parser.parse_mm_data(mm_data) + mm_items = self.data_parser.parse_mm_data(mm_data) mm_limits = self.ctx.get_mm_config().limit_per_prompt for modality, items in mm_items.items(): @@ -799,7 +791,7 @@ class BaseMultiModalProcessor(ABC): # Some HF processors (e.g. Qwen2-VL) expect corresponding # multi-modal tokens to be in the prompt text - dummy_inputs = self._get_dummy_processor_inputs( + dummy_inputs = self.profiling_info.get_dummy_processor_inputs( self.ctx.model_config.max_model_len, mm_missing_counts, ) @@ -1133,73 +1125,14 @@ class BaseMultiModalProcessor(ABC): mm_placeholders=mm_placeholder_ranges, ) - def _get_dummy_audios( - self, - *, - length: int, - num_audios: int, - ) -> list[npt.NDArray]: - audio = np.zeros((length, )) - return [audio] * num_audios - - def _get_dummy_images( - self, - *, - width: int, - height: int, - num_images: int, - ) -> list[Image.Image]: - image = Image.new("RGB", (width, height), color=0) - return [image] * num_images - - def _get_dummy_videos( - self, - *, - width: int, - height: int, - num_frames: int, - num_videos: int, - ) -> list[npt.NDArray]: - video = np.zeros((num_frames, width, height, 3)) - return [video] * num_videos - - @abstractmethod - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - """ - Build the multi-modal portion of the input which, after processing, - results in `mm_max_tokens` in :meth:`get_dummy_data`. 
- """ - raise NotImplementedError - - def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]: - mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt - supported_mm_limits = self.get_supported_mm_limits() - - mm_limits = { - modality: mm_limit_per_prompt.get(modality, 1) - for modality in supported_mm_limits - } - - for modality, supported_limit in supported_mm_limits.items(): - limit = mm_limits[modality] - if supported_limit is not None and supported_limit < limit: - raise ValueError( - f"You set {modality}={limit} (or defaulted to 1) in " - f"`--limit-mm-per-prompt`, but this model only supports " - f"at most {supported_limit} {modality} items.") - - return mm_limits - def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> MultiModalInputsV2: - processor_inputs = self._get_dummy_processor_inputs(seq_len, mm_counts) + profiling = self.profiling_info + processor_inputs = profiling.get_dummy_processor_inputs( + seq_len, mm_counts) return self.apply( prompt_text=processor_inputs.prompt_text, @@ -1211,8 +1144,9 @@ class BaseMultiModalProcessor(ABC): # Avoid circular import from vllm.sequence import SequenceData - mm_counts = self._get_and_validate_dummy_mm_counts() - mm_max_tokens_per_item = self.get_mm_max_tokens_per_item(seq_len) + profiling = self.profiling_info + mm_counts = profiling.get_mm_limits() + mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( "The keys returned by `get_supported_mm_limits`" diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py new file mode 100644 index 0000000000..2ecf0db1a4 --- /dev/null +++ b/vllm/multimodal/profiling.py @@ -0,0 +1,121 @@ +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +import numpy.typing as npt +from PIL import Image + +from vllm.inputs import InputProcessingContext +from vllm.logger import init_logger + +from .inputs import MultiModalDataDict + +logger = init_logger(__name__) + + +@dataclass +class ProcessorInputs: + """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" + prompt_text: str + mm_data: MultiModalDataDict + hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + + +class BaseProfilingInfo(ABC): + """ + Abstract base class that provides the information necessary to profile + multi-modal models. + """ + + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__() + + self.ctx = ctx + + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. + + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError + + @abstractmethod + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. 
+ """ + raise NotImplementedError + + @abstractmethod + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + """ + Build the multi-modal portion of the input which, after processing, + results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`. + """ + raise NotImplementedError + + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + + def get_mm_limits(self) -> Mapping[str, int]: + mm_config = self.ctx.get_mm_config() + mm_limit_per_prompt = mm_config.limit_per_prompt + + supported_mm_limits = self.get_supported_mm_limits() + + mm_limits = { + modality: mm_limit_per_prompt.get(modality, 1) + for modality in supported_mm_limits + } + + for modality, supported_limit in supported_mm_limits.items(): + limit = mm_limits[modality] + if supported_limit is not None and supported_limit < limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but this model only supports " + f"at most {supported_limit} {modality} items.") + + return mm_limits diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index fb4389dc4d..f75a594a4c 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -224,7 +224,7 @@ class MultiModalRegistry: tokenizer = cached_get_tokenizer(model_config.tokenizer) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len - return processor.get_mm_max_tokens_per_item(seq_len) + return processor.profiling_info.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) From ee77fdb5de42a6fead2b897d87d99d4b1e5650a9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Jan 2025 21:40:31 +0800 Subject: [PATCH 029/309] [Doc][2/N] Reorganize Models and Usage sections (#11755) Signed-off-by: DarkLight1337 --- .github/ISSUE_TEMPLATE/600-new-model.yml | 2 +- .../disagg_prefill/abstraction.jpg | Bin .../disagg_prefill/overview.jpg | Bin docs/source/contributing/model/basic.md | 102 ++++++++++++ docs/source/contributing/model/index.md | 26 +++ .../model/multimodal.md} | 8 +- .../source/contributing/model/registration.md | 56 +++++++ .../automatic_prefix_caching.md} | 6 +- docs/source/design/kernel/paged_attention.md | 2 + .../dev/offline_inference/offline_index.md | 1 + .../automatic_prefix_caching.md} | 8 +- .../compatibility_matrix.md | 6 +- .../{usage => features}/disagg_prefill.md | 4 +- docs/source/{usage => features}/lora.md | 0 .../{usage => features}/multimodal_inputs.md | 0 .../{ => features}/quantization/auto_awq.md | 0 .../source/{ => features}/quantization/bnb.md | 0 .../source/{ => features}/quantization/fp8.md | 0 .../quantization/fp8_e4m3_kvcache.md | 0 .../quantization/fp8_e5m2_kvcache.md | 0 .../{ => features}/quantization/gguf.md | 0 docs/source/features/quantization/index.md | 19 +++ .../{ => features}/quantization/int8.md | 0 .../quantization/supported_hardware.md | 10 +- .../source/{usage => features}/spec_decode.md | 0 .../{usage => 
features}/structured_outputs.md | 0 .../{usage => features}/tool_calling.md | 0 docs/source/index.md | 66 +++----- docs/source/models/adding_model.md | 155 ------------------ docs/source/models/supported_models.md | 2 +- .../optimization.md} | 4 +- docs/source/{usage => serving}/engine_args.md | 0 docs/source/{usage => serving}/env_vars.md | 0 .../serving/openai_compatible_server.md | 2 +- docs/source/{usage => serving}/usage_stats.md | 0 vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/config.py | 6 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/output_processor/multi_step.py | 2 +- vllm/executor/cpu_executor.py | 2 +- vllm/platforms/cpu.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/utils.py | 2 +- vllm/worker/multi_step_model_runner.py | 2 +- vllm/worker/utils.py | 2 +- 45 files changed, 265 insertions(+), 238 deletions(-) rename docs/source/assets/{usage => features}/disagg_prefill/abstraction.jpg (100%) rename docs/source/assets/{usage => features}/disagg_prefill/overview.jpg (100%) create mode 100644 docs/source/contributing/model/basic.md create mode 100644 docs/source/contributing/model/index.md rename docs/source/{models/enabling_multimodal_inputs.md => contributing/model/multimodal.md} (96%) create mode 100644 docs/source/contributing/model/registration.md rename docs/source/{automatic_prefix_caching/details.md => design/automatic_prefix_caching.md} (90%) rename docs/source/{automatic_prefix_caching/apc.md => features/automatic_prefix_caching.md} (97%) rename docs/source/{usage => features}/compatibility_matrix.md (98%) rename docs/source/{usage => features}/disagg_prefill.md (96%) rename docs/source/{usage => features}/lora.md (100%) rename docs/source/{usage => features}/multimodal_inputs.md (100%) rename docs/source/{ => features}/quantization/auto_awq.md (100%) rename docs/source/{ => features}/quantization/bnb.md (100%) rename docs/source/{ => features}/quantization/fp8.md (100%) rename docs/source/{ => features}/quantization/fp8_e4m3_kvcache.md (100%) rename docs/source/{ => features}/quantization/fp8_e5m2_kvcache.md (100%) rename docs/source/{ => features}/quantization/gguf.md (100%) create mode 100644 docs/source/features/quantization/index.md rename docs/source/{ => features}/quantization/int8.md (100%) rename docs/source/{ => features}/quantization/supported_hardware.md (86%) rename docs/source/{usage => features}/spec_decode.md (100%) rename docs/source/{usage => features}/structured_outputs.md (100%) rename docs/source/{usage => features}/tool_calling.md (100%) delete mode 100644 docs/source/models/adding_model.md rename docs/source/{usage/performance.md => performance/optimization.md} (98%) rename docs/source/{usage => serving}/engine_args.md (100%) rename docs/source/{usage => serving}/env_vars.md (100%) rename docs/source/{usage => serving}/usage_stats.md (100%) diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 794617a0cf..713e76c1a5 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -9,7 +9,7 @@ body: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). - #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. 
+ #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model. - type: textarea attributes: label: The model to consider. diff --git a/docs/source/assets/usage/disagg_prefill/abstraction.jpg b/docs/source/assets/features/disagg_prefill/abstraction.jpg similarity index 100% rename from docs/source/assets/usage/disagg_prefill/abstraction.jpg rename to docs/source/assets/features/disagg_prefill/abstraction.jpg diff --git a/docs/source/assets/usage/disagg_prefill/overview.jpg b/docs/source/assets/features/disagg_prefill/overview.jpg similarity index 100% rename from docs/source/assets/usage/disagg_prefill/overview.jpg rename to docs/source/assets/features/disagg_prefill/overview.jpg diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md new file mode 100644 index 0000000000..14690ffe24 --- /dev/null +++ b/docs/source/contributing/model/basic.md @@ -0,0 +1,102 @@ +(new-model-basic)= + +# Basic Implementation + +This guide walks you through the steps to implement a basic vLLM model. + +## 1. Bring your model code + +First, clone the PyTorch model code from the source repository. +For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from +HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. + +```{warning} +Make sure to review and adhere to the original code's copyright and licensing terms! +``` + +## 2. Make your code compatible with vLLM + +To ensure compatibility with vLLM, your model must meet the following requirements: + +### Initialization Code + +All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: + +- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. 
+ +The initialization code should look like this: + +```python +from torch import nn +from vllm.config import VllmConfig +from vllm.attention import Attention + +class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + +class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + +class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + +class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") +``` + +### Computation Code + +Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +```python +def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + ... +``` + +```{note} +Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. +If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +``` + +For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. + +## 3. (Optional) Implement tensor parallelism and quantization support + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). 
This class handles the sharded weight loading logic of multiple weight matrices. +- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +## 4. Implement the weight loading logic + +You now need to implement the `load_weights` method in your `*ForCausalLM` class. +This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +## 5. Register your model + +See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md new file mode 100644 index 0000000000..a2d601c83c --- /dev/null +++ b/docs/source/contributing/model/index.md @@ -0,0 +1,26 @@ +(new-model)= + +# Adding a New Model + +This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. + +```{toctree} +:caption: Contents +:maxdepth: 1 + +basic +registration +multimodal +``` + +```{note} +The complexity of adding a new model depends heavily on the model's architecture. +The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. +However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. +``` + +```{tip} +If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) +or ask on our [developer slack](https://slack.vllm.ai). +We will be happy to help you out! +``` diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/contributing/model/multimodal.md similarity index 96% rename from docs/source/models/enabling_multimodal_inputs.md rename to docs/source/contributing/model/multimodal.md index fdd7708879..e5dcd1223b 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/contributing/model/multimodal.md @@ -2,15 +2,11 @@ # Enabling Multimodal Inputs -This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). - -```{seealso} -[Adding a New Model](adding-a-new-model) -``` +This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). ## 1. Update the base vLLM model -It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model). +It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). Further update the model as follows: - Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. 
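
Step 4 of `basic.md` above describes the weight-loading logic only in prose, so here is a minimal illustrative sketch of the routing pattern it refers to, written as a free function rather than the `*ForCausalLM.load_weights` method for self-containment. It assumes a checkpoint that stores `q_proj`/`k_proj`/`v_proj` and `gate_proj`/`up_proj` separately while the vLLM module uses merged `qkv_proj` and `gate_up_proj` layers; those names, the mapping entries, and the `default_weight_loader` fallback follow the convention of existing in-tree models (e.g. the Llama implementation referenced earlier), but the exact names vary per model.

```python
from typing import Iterable, Set, Tuple

import torch
from torch import nn

from vllm.model_executor.model_loader.weight_utils import default_weight_loader


def load_weights(model: nn.Module,
                 weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
    """Schematic body of a *ForCausalLM.load_weights implementation."""
    # (vLLM param substring, HF checkpoint substring, shard id): entries route
    # separately-stored checkpoint weights into the merged vLLM layers.
    stacked_params_mapping = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
        ("qkv_proj", "v_proj", "v"),
    ]
    params_dict = dict(model.named_parameters())
    loaded_params: Set[str] = set()

    for name, loaded_weight in weights:
        for param_name, weight_name, shard_id in stacked_params_mapping:
            if weight_name not in name:
                continue
            # e.g. "model.layers.0.self_attn.q_proj.weight"
            #   -> "model.layers.0.self_attn.qkv_proj.weight"
            name = name.replace(weight_name, param_name)
            param = params_dict[name]
            # Merged parallel layers attach a shard-aware weight_loader that
            # copies this shard into the correct slice of the fused parameter.
            param.weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Unmerged parameters fall back to a plain copy.
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
        loaded_params.add(name)

    return loaded_params
```

In a real model this logic lives in the `load_weights` method of the `*ForCausalLM` class, and production implementations additionally skip checkpoint entries that have no counterpart in vLLM (such as rotary embedding caches); see the Llama implementation mentioned in `basic.md` for the complete treatment.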
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md new file mode 100644 index 0000000000..cf1cdb0c9d --- /dev/null +++ b/docs/source/contributing/model/registration.md @@ -0,0 +1,56 @@ +(new-model-registration)= + +# Model Registration + +vLLM relies on a model registry to determine how to run each model. +A list of pre-registered architectures can be found on the [Supported Models](#supported-models) page. + +If your model is not on this list, you must register it to vLLM. +This page provides detailed instructions on how to do so. + +## Built-in models + +To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source). +This gives you the ability to modify the codebase and test your model. + +After you have implemented your model (see [tutorial](#new-model-basic)), put it into the directory. +Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. +You should also include an example HuggingFace repository for this model in to run the unit tests. +Finally, update the [Supported Models](#supported-models) documentation page to promote your model! + +```{important} +The list of models in each section should be maintained in alphabetical order. +``` + +## Out-of-tree models + +You can load an external model using a plugin without modifying the vLLM codebase. + +```{seealso} +[vLLM's Plugin System](#plugin-system) +``` + +To register the model, use the following code: + +```python +from vllm import ModelRegistry +from your_code import YourModelForCausalLM +ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +from vllm import ModelRegistry + +ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +``` + +```{important} +If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +Read more about that [here](#enabling-multimodal-inputs). +``` + +```{note} +Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. +``` diff --git a/docs/source/automatic_prefix_caching/details.md b/docs/source/design/automatic_prefix_caching.md similarity index 90% rename from docs/source/automatic_prefix_caching/details.md rename to docs/source/design/automatic_prefix_caching.md index 17f806217a..4398536b2b 100644 --- a/docs/source/automatic_prefix_caching/details.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -1,6 +1,8 @@ -# Implementation +(design-automatic-prefix-caching)= -The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. +# Automatic Prefix Caching + +The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. 
Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index c21985b36e..f896f903c7 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -1,3 +1,5 @@ +(design-paged-attention)= + # vLLM Paged Attention - Currently, vLLM utilizes its own implementation of a multi-head query diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md index 318a02d8c7..c32f99d59e 100644 --- a/docs/source/dev/offline_inference/offline_index.md +++ b/docs/source/dev/offline_inference/offline_index.md @@ -1,6 +1,7 @@ # Offline Inference ```{toctree} +:caption: Contents :maxdepth: 1 llm diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/features/automatic_prefix_caching.md similarity index 97% rename from docs/source/automatic_prefix_caching/apc.md rename to docs/source/features/automatic_prefix_caching.md index c0c141c5fb..3d70cbb29c 100644 --- a/docs/source/automatic_prefix_caching/apc.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -1,13 +1,13 @@ -(apc)= +(automatic-prefix-caching)= -# Introduction +# Automatic Prefix Caching -## What is Automatic Prefix Caching +## Introduction Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. ```{note} -Technical details on how vLLM implements APC are in the next page. +Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). 
``` ## Enabling APC in vLLM diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md similarity index 98% rename from docs/source/usage/compatibility_matrix.md rename to docs/source/features/compatibility_matrix.md index 3cefa12ea8..8d8f7dca2e 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -32,7 +32,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar * - Feature - [CP](#chunked-prefill) - - [APC](#apc) + - [APC](#automatic-prefix-caching) - [LoRA](#lora-adapter) - prmpt adptr - [SD](#spec_decode) @@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [APC](#apc) + * - [APC](#automatic-prefix-caching) - ✅ - - @@ -345,7 +345,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - [APC](#apc) + * - [APC](#automatic-prefix-caching) - [✗](gh-issue:3687) - ✅ - ✅ diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/features/disagg_prefill.md similarity index 96% rename from docs/source/usage/disagg_prefill.md rename to docs/source/features/disagg_prefill.md index a61c00fad1..05226f2dec 100644 --- a/docs/source/usage/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -41,13 +41,13 @@ Key abstractions for disaggregated prefilling: Here is a figure illustrating how the above 3 abstractions are organized: -```{image} /assets/usage/disagg_prefill/abstraction.jpg +```{image} /assets/features/disagg_prefill/abstraction.jpg :alt: Disaggregated prefilling abstractions ``` The workflow of disaggregated prefilling is as follows: -```{image} /assets/usage/disagg_prefill/overview.jpg +```{image} /assets/features/disagg_prefill/overview.jpg :alt: Disaggregated prefilling workflow ``` diff --git a/docs/source/usage/lora.md b/docs/source/features/lora.md similarity index 100% rename from docs/source/usage/lora.md rename to docs/source/features/lora.md diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/features/multimodal_inputs.md similarity index 100% rename from docs/source/usage/multimodal_inputs.md rename to docs/source/features/multimodal_inputs.md diff --git a/docs/source/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md similarity index 100% rename from docs/source/quantization/auto_awq.md rename to docs/source/features/quantization/auto_awq.md diff --git a/docs/source/quantization/bnb.md b/docs/source/features/quantization/bnb.md similarity index 100% rename from docs/source/quantization/bnb.md rename to docs/source/features/quantization/bnb.md diff --git a/docs/source/quantization/fp8.md b/docs/source/features/quantization/fp8.md similarity index 100% rename from docs/source/quantization/fp8.md rename to docs/source/features/quantization/fp8.md diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md similarity index 100% rename from docs/source/quantization/fp8_e4m3_kvcache.md rename to docs/source/features/quantization/fp8_e4m3_kvcache.md diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md similarity index 100% rename from docs/source/quantization/fp8_e5m2_kvcache.md rename to docs/source/features/quantization/fp8_e5m2_kvcache.md diff --git a/docs/source/quantization/gguf.md b/docs/source/features/quantization/gguf.md similarity index 100% rename from docs/source/quantization/gguf.md 
rename to docs/source/features/quantization/gguf.md diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md new file mode 100644 index 0000000000..861cb165c1 --- /dev/null +++ b/docs/source/features/quantization/index.md @@ -0,0 +1,19 @@ +(quantization-index)= + +# Quantization + +Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. + +```{toctree} +:caption: Contents +:maxdepth: 1 + +supported_hardware +auto_awq +bnb +gguf +int8 +fp8 +fp8_e5m2_kvcache +fp8_e4m3_kvcache +``` diff --git a/docs/source/quantization/int8.md b/docs/source/features/quantization/int8.md similarity index 100% rename from docs/source/quantization/int8.md rename to docs/source/features/quantization/int8.md diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md similarity index 86% rename from docs/source/quantization/supported_hardware.md rename to docs/source/features/quantization/supported_hardware.md index 7330c2f8aa..988288a82d 100644 --- a/docs/source/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -1,6 +1,6 @@ -(supported-hardware-for-quantization)= +(quantization-supported-hardware)= -# Supported Hardware for Quantization Kernels +# Supported Hardware The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: @@ -120,12 +120,12 @@ The table below shows the compatibility of various quantization implementations - ✗ ``` -## Notes: - - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - "✅︎" indicates that the quantization method is supported on the specified hardware. - "✗" indicates that the quantization method is not supported on the specified hardware. -Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. +```{note} +This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. 
+``` diff --git a/docs/source/usage/spec_decode.md b/docs/source/features/spec_decode.md similarity index 100% rename from docs/source/usage/spec_decode.md rename to docs/source/features/spec_decode.md diff --git a/docs/source/usage/structured_outputs.md b/docs/source/features/structured_outputs.md similarity index 100% rename from docs/source/usage/structured_outputs.md rename to docs/source/features/structured_outputs.md diff --git a/docs/source/usage/tool_calling.md b/docs/source/features/tool_calling.md similarity index 100% rename from docs/source/usage/tool_calling.md rename to docs/source/features/tool_calling.md diff --git a/docs/source/index.md b/docs/source/index.md index f390474978..4bc40bf0f5 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -79,6 +79,9 @@ serving/metrics serving/integrations serving/tensorizer serving/runai_model_streamer +serving/engine_args +serving/env_vars +serving/usage_stats ``` ```{toctree} @@ -88,53 +91,28 @@ serving/runai_model_streamer models/supported_models models/generative_models models/pooling_models -models/adding_model -models/enabling_multimodal_inputs ``` ```{toctree} -:caption: Usage +:caption: Features :maxdepth: 1 -usage/lora -usage/multimodal_inputs -usage/tool_calling -usage/structured_outputs -usage/spec_decode -usage/compatibility_matrix -usage/performance -usage/engine_args -usage/env_vars -usage/usage_stats -usage/disagg_prefill -``` - -```{toctree} -:caption: Quantization -:maxdepth: 1 - -quantization/supported_hardware -quantization/auto_awq -quantization/bnb -quantization/gguf -quantization/int8 -quantization/fp8 -quantization/fp8_e5m2_kvcache -quantization/fp8_e4m3_kvcache -``` - -```{toctree} -:caption: Automatic Prefix Caching -:maxdepth: 1 - -automatic_prefix_caching/apc -automatic_prefix_caching/details +features/quantization/index +features/lora +features/multimodal_inputs +features/tool_calling +features/structured_outputs +features/automatic_prefix_caching +features/disagg_prefill +features/spec_decode +features/compatibility_matrix ``` ```{toctree} :caption: Performance :maxdepth: 1 +performance/optimization performance/benchmarks ``` @@ -148,10 +126,8 @@ community/meetups community/sponsors ``` -% API Documentation: API reference aimed at vllm library usage - ```{toctree} -:caption: API Documentation +:caption: API Reference :maxdepth: 2 dev/sampling_params @@ -160,30 +136,32 @@ dev/offline_inference/offline_index dev/engine/engine_index ``` -% Design: docs about vLLM internals +% Design Documents: Details about vLLM internals ```{toctree} -:caption: Design +:caption: Design Documents :maxdepth: 2 design/arch_overview design/huggingface_integration design/plugin_system -design/input_processing/model_inputs_index design/kernel/paged_attention +design/input_processing/model_inputs_index design/multimodal/multimodal_index +design/automatic_prefix_caching design/multiprocessing ``` -% For Developers: contributing to the vLLM project +% Developer Guide: How to contribute to the vLLM project ```{toctree} -:caption: For Developers +:caption: Developer Guide :maxdepth: 2 contributing/overview contributing/profiling/profiling_index contributing/dockerfile/dockerfile +contributing/model/index ``` # Indices and tables diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md deleted file mode 100644 index 02537fba02..0000000000 --- a/docs/source/models/adding_model.md +++ /dev/null @@ -1,155 +0,0 @@ -(adding-a-new-model)= - -# Adding a New Model - -This document provides a high-level guide 
on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. - -```{note} -The complexity of adding a new model depends heavily on the model's architecture. -The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. -However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -``` - -```{note} -By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, -please follow [this guide](#enabling-multimodal-inputs) after implementing the model here. -``` - -```{tip} -If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository. -We will be happy to help you out! -``` - -## 0. Fork the vLLM repository - -Start by forking our [GitHub] repository and then [build it from source](#build-from-source). -This gives you the ability to modify the codebase and test your model. - -```{tip} -If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. -``` - -## 1. Bring your model code - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the directory. -For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. - -```{warning} -When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. -``` - -## 2. Make your code compatible with vLLM - -To ensure compatibility with vLLM, your model must meet the following requirements: - -### Initialization Code - -All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: - -- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. -- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. 
- -The initialization code should look like this: - -```python -from torch import nn -from vllm.config import VllmConfig -from vllm.attention import Attention - -class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - -class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - -class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - -class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") -``` - -### Computation Code - -Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. - -```python -def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, -) -> torch.Tensor: - ... -``` - -```{note} -Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. -If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -``` - -For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. - -## 3. (Optional) Implement tensor parallelism and quantization support - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). 
This class handles the sharded weight loading logic of multiple weight matrices. -- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -## 4. Implement the weight loading logic - -You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -## 5. Register your model - -Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . - -## 6. Out-of-Tree Model Integration - -You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see [plugin-system](#plugin-system). - -To register the model, use the following code: - -```python -from vllm import ModelRegistry -from your_code import YourModelForCausalLM -ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -``` - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -```python -from vllm import ModelRegistry - -ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") -``` - -```{important} -If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#enabling-multimodal-inputs). -``` - -```{note} -Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. -``` diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 7682ed104b..5a27780261 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -37,7 +37,7 @@ print(output) If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. ```` -Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM. +Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. 
### ModelScope diff --git a/docs/source/usage/performance.md b/docs/source/performance/optimization.md similarity index 98% rename from docs/source/usage/performance.md rename to docs/source/performance/optimization.md index 2cd3801bfc..4fcde9b03b 100644 --- a/docs/source/usage/performance.md +++ b/docs/source/performance/optimization.md @@ -1,6 +1,6 @@ -(performance)= +(optimization-and-tuning)= -# Performance and Tuning +# Optimization and Tuning ## Preemption diff --git a/docs/source/usage/engine_args.md b/docs/source/serving/engine_args.md similarity index 100% rename from docs/source/usage/engine_args.md rename to docs/source/serving/engine_args.md diff --git a/docs/source/usage/env_vars.md b/docs/source/serving/env_vars.md similarity index 100% rename from docs/source/usage/env_vars.md rename to docs/source/serving/env_vars.md diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index caf5e8cafd..97e9879075 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -217,7 +217,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. +see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* Code example: diff --git a/docs/source/usage/usage_stats.md b/docs/source/serving/usage_stats.md similarity index 100% rename from docs/source/usage/usage_stats.md rename to docs/source/serving/usage_stats.md diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 480901f710..d43c15b661 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -430,7 +430,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/config.py b/vllm/config.py index b0ed88cb7f..8b824a1fca 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -644,7 +644,7 @@ class ModelConfig: self.use_async_output_proc = False return - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid from vllm.platforms import current_platform if not current_platform.is_async_output_supported(self.enforce_eager): @@ -665,7 +665,7 @@ class ModelConfig: if self.runner_type == "pooling": self.use_async_output_proc = False - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -2064,7 +2064,7 @@ class LoRAConfig: model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # 
Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: logger.warning("LoRA with chunked prefill is still experimental " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 69c7c5077f..e94664308c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1148,7 +1148,7 @@ class EngineArgs: disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 1c6f735f39..c8b282b1a7 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 5495bc50ed..c7f018d9a2 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 09bde9f065..7ba7f51501 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -50,7 +50,7 @@ class CpuPlatform(Platform): import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index de593113b9..e369da1a70 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/usage/compatibility_matrix.md +# Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
diff --git a/vllm/utils.py b/vllm/utils.py index aadeddabf8..63057153f8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -58,7 +58,7 @@ logger = init_logger(__name__) # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/usage/compatibility_matrix.md +# Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index dee63a75c0..a2c2cebf8d 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -822,7 +822,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 8f2d343440..ffa8c4cb0f 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: From 9279b9f83dd3aa5bb3d3ce57bf92d9361755d164 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 6 Jan 2025 05:48:53 -0800 Subject: [PATCH 030/309] [Bugfix] Fix max image size for LLaVA-Onevision (#11769) Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_onevision.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 6dccc1e0d3..5eac2f223d 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,8 +19,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -170,6 +170,22 @@ class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, BaseLlavaProfilingInfo): + def _get_image_size_with_most_features(self) -> ImageSize: + hf_config = self._get_hf_config() + largest_feature_size, largest_feature_pinpoint = 0, None + for (height, width) in hf_config.image_grid_pinpoints: + feat_size = self._get_num_image_tokens(image_width=width, + image_height=height) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise 
ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} From 4ca5d40adc53aca2a1fbaed81d9d622fde46ebf1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 6 Jan 2025 21:57:44 +0800 Subject: [PATCH 031/309] [doc] explain how to add interleaving sliding window support (#11771) Signed-off-by: youkaichao --- docs/source/contributing/model/basic.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index 14690ffe24..002808ac5f 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -100,3 +100,16 @@ This method should load the weights from the HuggingFace's checkpoint file and a ## 5. Register your model See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. + +## Frequently Asked Questions + +### How to support models with interleaving sliding windows? + +For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation. + +To support a model with interleaving sliding windows, we need to take care of the following details: + +- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model. +- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). + +With these two steps, interleave sliding windows should work with the model. 
From 32c9eff2fff8ee91a60c9410c69042dc4c1cc5c8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 6 Jan 2025 23:22:25 +0800 Subject: [PATCH 032/309] [Bugfix][V1] Fix molmo text-only inputs (#11676) Signed-off-by: Jee Jee Li --- .../vision_language/test_models.py | 10 ++ .../vision_language/vlm_utils/model_utils.py | 99 ++++++++++++++++++- vllm/model_executor/models/molmo.py | 56 ++++------- 3 files changed, 123 insertions(+), 42 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index dc0b683c1f..146685738a 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -341,6 +341,16 @@ VLM_TEST_SETTINGS = { ), hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "molmo": VLMTestInfo( + models=["allenai/Molmo-7B-D-0924"], + test_type=(VLMTestType.IMAGE), + prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + image_size_factors=[(),(1.0, 1.0, 1.0)], + patch_hf_runner=model_utils.mlomo_patch_hf_runner, + postprocess_inputs=model_utils.molmo_post_processor, + ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. # https://github.com/huggingface/transformers/issues/34307 diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 3eca8fb9dc..6c7a753af7 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -5,17 +5,20 @@ typically specific to a small subset of models. 
import re import types from pathlib import PosixPath -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch from PIL.Image import Image -from transformers import AutoConfig, AutoTokenizer, BatchEncoding +from transformers import (AutoConfig, AutoTokenizer, BatchEncoding, + GenerationConfig) from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from .....conftest import HfRunner, ImageAsset, _ImageAssets +from .....conftest import (HfRunner, ImageAsset, PromptAudioInput, + PromptImageInput, PromptVideoInput, _ImageAssets) +from ....utils import TokensTextLogprobs from .types import RunnerOutput @@ -222,6 +225,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): return {"model_inputs": hf_inputs} +def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str): + hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype) + return {k: v.unsqueeze(0) for k, v in hf_inputs.items()} + + ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], @@ -451,3 +459,88 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hf_model.model.generate = types.MethodType(_generate, hf_model.model) return hf_model + + +def _generate_greedy_logprobs_limit( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + **kwargs: Any, +) -> List[TokensTextLogprobs]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + # Process in batches for inference. 
+ if len(all_inputs): + input_ids_lst = [] + images_lst = [] + images_input_idx_lst = [] + imges_masks_lst = [] + for inputs in all_inputs: + input_ids_lst.append(inputs["input_ids"]) + images_lst.append(inputs["images"]) + images_input_idx_lst.append(inputs["image_input_idx"]) + imges_masks_lst.append(inputs["image_masks"]) + batch_inputs = {} + batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0) + batch_inputs['images'] = torch.cat(images_lst, dim=0) + batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst, + dim=0) + batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0) + + outputs = self.model.generate_from_batch( + batch=self.wrap_device(batch_inputs, + device=self.model.device.type), + generation_config=GenerationConfig( + max_new_tokens=max_tokens, + stop_strings="<|endoftext|>", + do_sample=False, + ), + tokenizer=self.tokenizer, + output_hidden_states=True, + return_dict_in_generate=True, + ) + + all_logprobs: List[List[Dict[int, float]]] = [] + all_output_ids: List[List[int]] = [] + all_output_strs: List[str] = [] + + for index in range(len(all_inputs)): + ( + seq_logprobs_lst, + output_len, + ) = self._hidden_states_to_logprobs(outputs.hidden_states, + num_logprobs) + all_logprobs.append(seq_logprobs_lst) + seq_ids = outputs.sequences[index] + output_ids = seq_ids[-output_len:] + all_output_ids.append(output_ids.tolist()) + all_output_strs.append(self.tokenizer.decode(output_ids)) + outputs = zip(all_output_ids, all_output_strs, all_logprobs) + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + + +####### Molmo-specific HuggingFace runner patchers +def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Molmo.""" + hf_processor = hf_model.processor + + def _processor(*args, **kwargs): + return hf_processor.process(*args, **kwargs) + + hf_model.processor = _processor + + setattr( # noqa: B010 + hf_model, + "generate_greedy_logprobs_limit", + types.MethodType(_generate_greedy_logprobs_limit, hf_model), + ) + + return hf_model diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index cc25be9f5b..0e8287bb56 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1081,45 +1081,25 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): else: out = processor.process(None, image, tokens=inputs["prompt_token_ids"]) + # If there is no image, return directly. 
+ if image is None: + new_prompt_token_ids = out["input_ids"].tolist() + prompt = inputs.get("prompt") + if prompt is None: + prompt = tokenizer.decode(new_prompt_token_ids) + return token_inputs( + prompt_token_ids=new_prompt_token_ids, + prompt=prompt, + ) + image_processor = processor.image_processor max_total_crops = 1 + image_processor.max_crops - if image is not None: - images, image_input_idx, image_masks = pad_images( - max_total_crops, - out["images"], - out["image_input_idx"], - out.get("image_masks"), - ) - else: - base_image_input_size = image_processor.base_image_input_size - image_patch_size = image_processor.image_patch_size - image_num_patch = ( - base_image_input_size[0] // image_patch_size, - base_image_input_size[1] // image_patch_size, - ) - n_pixels = image_patch_size * image_patch_size * 3 - n_patches = image_num_patch[0] * image_num_patch[1] - - image_length_w = image_processor.image_token_length_w - image_length_h = image_processor.image_token_length_h - tokens_per_image = image_length_w * image_length_h - images = torch.full( - (max_total_crops, n_patches, n_pixels), - -1, - dtype=torch.float32, - ) - image_input_idx = torch.full( - (max_total_crops, tokens_per_image), - -1, - dtype=torch.int32, - ) - if image_processor.image_padding_mask: - image_masks = torch.full( - (max_total_crops, n_patches), - -1, - dtype=torch.float32, - ) - + images, image_input_idx, image_masks = pad_images( + max_total_crops, + out["images"], + out["image_input_idx"], + out.get("image_masks"), + ) image_data = dict( images=images, image_input_idx=image_input_idx, @@ -1143,11 +1123,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): offset = i size += 1 image_data["image_start_end"] = (offset, offset + size) - prompt = inputs.get("prompt") if prompt is None: prompt = tokenizer.decode(new_prompt_token_ids) - return token_inputs( prompt_token_ids=new_prompt_token_ids, prompt=prompt, From e20c92bb618384ce8d0013e0c9ad273d0c23d65b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 7 Jan 2025 00:11:28 +0800 Subject: [PATCH 033/309] [Kernel] Move attn_type to Attention.__init__() (#11690) Signed-off-by: Chen Zhang --- tests/kernels/test_encoder_decoder_attn.py | 100 ++++++++++---------- tests/kernels/utils.py | 12 ++- vllm/attention/backends/abstract.py | 2 +- vllm/attention/backends/blocksparse_attn.py | 14 +-- vllm/attention/backends/flash_attn.py | 4 +- vllm/attention/backends/flashinfer.py | 15 ++- vllm/attention/backends/hpu_attn.py | 13 +-- vllm/attention/backends/ipex_attn.py | 12 +-- vllm/attention/backends/pallas.py | 13 +-- vllm/attention/backends/rocm_flash_attn.py | 14 +-- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 6 +- vllm/attention/layer.py | 37 ++------ vllm/model_executor/models/bart.py | 44 +++------ vllm/model_executor/models/bert.py | 10 +- vllm/model_executor/models/mllama.py | 11 +-- vllm/model_executor/models/qwen2.py | 35 ++++--- vllm/v1/attention/backends/flash_attn.py | 14 +-- 18 files changed, 159 insertions(+), 201 deletions(-) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index d943b048b7..6146743757 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -13,8 +13,7 @@ import pytest import torch from tests.kernels.utils import * -from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, - AttentionType) +from vllm.attention import Attention, AttentionMetadata, 
AttentionType from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) @@ -64,6 +63,7 @@ class TestPoint(NamedTuple): max_dec_seq_len: int max_enc_seq_len: int num_blocks: int + attn_type: AttentionType class TestResources(NamedTuple): @@ -96,7 +96,6 @@ class TestResources(NamedTuple): ''' scale: float - attn_backend: AttentionBackend attn: Attention kv_cache: torch.Tensor @@ -129,16 +128,17 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: ''' scale = float(1.0 / (test_pt.head_size**0.5)) - attn_backend = make_backend(test_pt.backend_name) attn = Attention( test_pt.num_heads, test_pt.head_size, scale=scale, + prefix=f"{test_pt.attn_type}", + attn_type=test_pt.attn_type, ) if test_pt.num_blocks is None or test_pt.num_heads is None: # Caller does not require a KV cache return TestResources( - scale, attn_backend, attn, + scale, attn, torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) # Construct KV cache @@ -148,7 +148,7 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: test_pt.block_size, device=CUDA_DEVICE, backend=test_pt.backend_name) - return TestResources(scale, attn_backend, attn, kv_cache) + return TestResources(scale, attn, kv_cache) def _encoder_attn_setup( @@ -193,6 +193,7 @@ def _encoder_attn_setup( _, max_q_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -301,6 +302,7 @@ def _decoder_attn_setup( max_q_seq_len, _, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -488,6 +490,7 @@ def _enc_dec_cross_attn_setup_reuses_query( max_decoder_seq_len, max_encoder_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -622,7 +625,6 @@ def _run_encoder_attention_test( & attn_metadata ''' assert attn_metadata.num_decode_tokens == 0 - attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None with set_forward_context(attn_metadata, vllm_config): @@ -635,14 +637,11 @@ def _run_encoder_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + return attn.forward( + reshaped_query, packed_qkv.key, packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), attn_metadata) def _run_decoder_self_attention_test( @@ -675,7 +674,6 @@ def _run_decoder_self_attention_test( * Attention.forward() applied to packed_{query,key,value}, kv_cache & attn_metadata ''' - attn_type = AttentionType.DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv @@ -690,12 +688,8 @@ def _run_decoder_self_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value, + kv_cache, attn_metadata) def _run_encoder_decoder_cross_attention_test( @@ -742,7 +736,6 @@ def _run_encoder_decoder_cross_attention_test( ''' assert decoder_test_params.packed_qkvo.packed_qkv is not None - attn_type = AttentionType.ENCODER_DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache if cross_test_params is None: @@ -762,12 +755,8 @@ def _run_encoder_decoder_cross_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, key, value, kv_cache, + attn_metadata) @pytest.fixture(autouse=True) @@ -839,7 +828,7 @@ def test_encoder_only( # is not part of this test test_pt = TestPoint(num_heads, head_size, attn_backend.name, batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + max_enc_seq_len, 4096, AttentionType.ENCODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init @@ -855,7 +844,7 @@ def test_encoder_only( # Shared prefill metadata structure prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, None, decoder_test_params=None, @@ -961,20 +950,29 @@ def test_e2e_enc_dec_attn( # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test - test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.ENCODER) + enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, + AttentionType.ENCODER_DECODER) + dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.DECODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): - test_rsrcs = _make_test_resources(test_pt) + enc_test_rsrcs = _make_test_resources(enc_test_pt) + enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt) + dec_test_rsrcs = _make_test_resources(dec_test_pt) # Construct encoder attention test params (only used # during prefill) - enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs) + enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs) # Construct Decoder self-attention prefill-phase & decode-phase # test params, including query/key/value tensors, decoder self-attention @@ -987,7 +985,7 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, decphase_dec_test_params, cross_block_base_addr, - ) = _decoder_attn_setup(test_pt, test_rsrcs) + ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs) # Construct encoder/decoder cross-attention prefill-phase # & decode-phase test params, including key/value tensors, @@ -1000,14 +998,14 
@@ def test_e2e_enc_dec_attn( dec_qkv, enc_test_params, prephase_dec_test_params, - test_pt, - test_rsrcs, + enc_dec_test_pt, + enc_dec_test_rsrcs, block_base_addr=cross_block_base_addr) # Shared prefill metadata structure assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens, decoder_test_params=prephase_dec_test_params, @@ -1017,10 +1015,10 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder attention - enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, + enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, enc_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_test_pt, vllm_config=vllm_config) # - Is encoder attention result correct? @@ -1030,10 +1028,10 @@ def test_e2e_enc_dec_attn( # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, prephase_dec_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is prefill decoder self-attention correct? @@ -1044,11 +1042,11 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is prefill encoder/decoder cross-attention correct? @@ -1059,7 +1057,7 @@ def test_e2e_enc_dec_attn( # DECODE: build decode-phase attention metadata decphase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, False, dec_qkv.q_seq_lens, decoder_test_params=decphase_dec_test_params, @@ -1070,10 +1068,10 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, decphase_dec_test_params, decphase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is decode-phase decoder self-attention correct? @@ -1084,11 +1082,11 @@ def test_e2e_enc_dec_attn( # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is decode-phase encoder/decoder cross-attention correct? 
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index e7865fb250..848eea7f54 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,6 +13,7 @@ from torch._prims_common import TensorLikeType from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms.interface import _Backend from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) @@ -790,7 +791,7 @@ def make_block_tables_slot_mapping( def make_test_metadata( - attn_backend: AttentionBackend, + attn_backend: _Backend, is_prompt: bool, seq_lens: Optional[List[int]], decoder_test_params: Optional[PhaseTestParameters], @@ -815,7 +816,7 @@ def make_test_metadata( Arguments: - * attn_backend: Backend for sourcing attention kernels + * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence * decoder_test_params: decoder self-attention test params; @@ -882,6 +883,8 @@ def make_test_metadata( # (kv_mmap) cross_kv_mmap = cross_test_params.kv_mmap + attn_backend_obj = make_backend(attn_backend.name) + if is_prompt: # Prefill-phase scenario @@ -902,8 +905,7 @@ def make_test_metadata( context_lens, encoder_seq_lens, device=device) - - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), multi_modal_placeholder_index_maps=None, @@ -952,7 +954,7 @@ def make_test_metadata( encoder_seq_lens, device=device) - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, multi_modal_placeholder_index_maps=None, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index aed04361e5..f5dcaea79a 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -233,6 +233,7 @@ class AttentionImpl(ABC, Generic[T]): kv_cache_dtype: str = "auto", blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: raise NotImplementedError @@ -246,7 +247,6 @@ class AttentionImpl(ABC, Generic[T]): attn_metadata: T, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 99cb84346d..7089d59392 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -300,6 +300,7 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: assert blocksparse_params is not None assert alibi_slopes is None, ValueError( @@ -350,6 +351,12 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): active_head_range=self.blocksparse_params.active_head_range, ) + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "BlocksparseFlashAttentionImpl") + def forward( self, query: torch.Tensor, @@ -359,7 +366,6 @@ class 
BlocksparseFlashAttentionImpl(AttentionImpl): attn_metadata: BlocksparseFlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -375,12 +381,6 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "BlocksparseFlashAttentionImpl") - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index c69e12ad78..23ea244f07 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -600,6 +600,7 @@ class FlashAttentionImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -627,6 +628,7 @@ class FlashAttentionImpl(AttentionImpl): raise ValueError( f"Head size {head_size} is not supported by FlashAttention. " f"Supported head sizes are: {support_head_sizes}.") + self.attn_type = attn_type def forward( self, @@ -637,7 +639,6 @@ class FlashAttentionImpl(AttentionImpl): attn_metadata: FlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -659,6 +660,7 @@ class FlashAttentionImpl(AttentionImpl): assert output is not None, "Output tensor must be provided." 
+ attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index e367468d05..a11462b206 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -748,6 +748,7 @@ class FlashInferImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -764,6 +765,12 @@ class FlashInferImpl(AttentionImpl): assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashInferImpl") + def forward( self, query: torch.Tensor, @@ -773,18 +780,10 @@ class FlashInferImpl(AttentionImpl): attn_metadata: FlashInferMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: # TODO: directly write to output tensor - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashInferImpl") - num_heads: int = self.num_heads head_size: int = self.head_size num_kv_heads: int = self.num_kv_heads diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index f90d15d420..94a461e0c8 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -102,6 +102,7 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, + attn_type: str = AttentionType.DECODER, ) -> None: super(AttentionImpl, self).__init__() self.kv_cache_dtype = kv_cache_dtype @@ -143,6 +144,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {suppored_head_sizes}.") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HPUAttentionImpl") + def forward( self, query: torch.Tensor, @@ -152,7 +159,6 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): attn_metadata: HPUAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. 
@@ -166,11 +172,6 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "HPUAttentionImpl") batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 21949874be..da1d307daa 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -115,6 +115,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -146,6 +147,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): raise NotImplementedError( "IPEX backend does not support FP8 KV cache. " "Please use xFormers backend instead.") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "IpexAttnBackendImpl") def split_kv_cache( self, @@ -172,7 +178,6 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): attn_metadata: IpexAttnMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. @@ -189,11 +194,6 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): shape = [num_tokens, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "IpexAttnBackendImpl") num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 9809aed0e6..2ac492dd8a 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -100,6 +100,7 @@ class PallasAttentionBackendImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -141,6 +142,12 @@ class PallasAttentionBackendImpl(AttentionImpl): # megacore mode will be None. self.megacore_mode = "batch" + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "PallasAttentionBackendImpl") + def forward( self, query: torch.Tensor, @@ -150,7 +157,6 @@ class PallasAttentionBackendImpl(AttentionImpl): attn_metadata: PallasMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. 
@@ -168,11 +174,6 @@ class PallasAttentionBackendImpl(AttentionImpl): shape = [batch_size, seq_len, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "PallasAttentionBackendImpl") batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index d43c15b661..a91a5af5c3 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -338,6 +338,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -397,6 +398,12 @@ class ROCmFlashAttentionImpl(AttentionImpl): self.attn_func = _sdpa_attention logger.debug("Using naive attention in ROCmBackend") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "ROCmFlashAttentionImpl") + def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" tokens, n_kv_heads, head_dim = x.shape @@ -414,7 +421,6 @@ class ROCmFlashAttentionImpl(AttentionImpl): attn_metadata: ROCmFlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -432,12 +438,6 @@ class ROCmFlashAttentionImpl(AttentionImpl): """ # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "ROCmFlashAttentionImpl") - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 0cff6f5952..c14f775459 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -390,6 +390,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -421,6 +422,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): raise NotImplementedError( "Torch SDPA backend does not support FP8 KV cache. " "Please use xFormers backend instead.") + self.attn_type = attn_type def forward( self, @@ -431,7 +433,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): attn_metadata: TorchSDPAMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. 
@@ -448,6 +449,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): shape = [num_tokens, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 + attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 3e59b3603d..694c7cc1bc 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -379,6 +379,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -405,6 +406,8 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {suppored_head_sizes}.") + self.attn_type = attn_type + def forward( self, query: torch.Tensor, @@ -414,7 +417,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): attn_metadata: "XFormersMetadata", k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -468,7 +470,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): Returns: shape = [num_tokens, num_heads * head_size] """ - + attn_type = self.attn_type # Check that appropriate attention metadata attributes are # selected for the desired attention type if (attn_type == AttentionType.ENCODER diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 69b6d1e464..f1b3598e60 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -41,6 +41,7 @@ class Attention(nn.Module): logits_soft_cap: Optional[float] = None, per_layer_sliding_window: Optional[int] = None, prefix: str = "", + attn_type: str = AttentionType.DECODER, ) -> None: super().__init__() if per_layer_sliding_window is not None: @@ -96,7 +97,7 @@ class Attention(nn.Module): impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap) + blocksparse_params, logits_soft_cap, attn_type) self.num_heads = num_heads self.head_size = head_size self.num_kv_heads = num_kv_heads @@ -119,6 +120,7 @@ class Attention(nn.Module): raise ValueError(f"Duplicate layer name: {prefix}") compilation_config.static_forward_context[prefix] = self self.layer_name = prefix + self.attn_type = attn_type def forward( self, @@ -127,18 +129,12 @@ class Attention(nn.Module): value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, - attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: if self.use_direct_call: - return self.impl.forward(query, - key, - value, - kv_cache, - attn_metadata, - self._k_scale, - self._v_scale, - attn_type=attn_type) + return self.impl.forward(query, key, value, kv_cache, + attn_metadata, self._k_scale, + self._v_scale) elif self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) @@ -152,13 +148,11 @@ class Attention(nn.Module): if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) torch.ops.vllm.unified_attention_with_output( - query, key, value, output, kv_cache, attn_type, - self.layer_name) + query, key, value, 
output, kv_cache, self.layer_name) return output.view(-1, hidden_size) else: return torch.ops.vllm.unified_attention(query, key, value, - kv_cache, attn_type, - self.layer_name) + kv_cache, self.layer_name) def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore @@ -237,20 +231,13 @@ def unified_attention( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> torch.Tensor: forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.dynamic_forward_context self = forward_context.static_forward_context[layer_name] - return self.impl.forward(query, - key, - value, - kv_cache, - attn_metadata, - self._k_scale, - self._v_scale, - attn_type=attn_type) + return self.impl.forward(query, key, value, kv_cache, attn_metadata, + self._k_scale, self._v_scale) def unified_attention_fake( @@ -258,7 +245,6 @@ def unified_attention_fake( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> torch.Tensor: return torch.empty_like(query).contiguous() @@ -279,7 +265,6 @@ def unified_attention_with_output( value: torch.Tensor, output: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> None: forward_context: ForwardContext = get_forward_context() @@ -292,7 +277,6 @@ def unified_attention_with_output( attn_metadata, self._k_scale, self._v_scale, - attn_type=attn_type, output=output) @@ -302,7 +286,6 @@ def unified_attention_with_output_fake( value: torch.Tensor, output: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> None: return diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 3776490cb3..57eb5adc82 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -71,12 +71,8 @@ class BartLearnedPositionalEmbedding(VocabParallelEmbedding): def forward( self, positions: torch.Tensor, - attn_type: AttentionType, ) -> torch.Tensor: """`input_ids' shape is expected to be [bsz x seqlen].""" - - assert attn_type != AttentionType.ENCODER_DECODER - return super().forward(positions + self.offset) @@ -180,7 +176,8 @@ class BartEncoderAttention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER) def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata) -> torch.Tensor: @@ -189,12 +186,7 @@ class BartEncoderAttention(nn.Module): qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) return output @@ -264,7 +256,8 @@ class BartDecoderSelfAttention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.DECODER) def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata) -> torch.Tensor: @@ -273,12 +266,7 @@ class BartDecoderSelfAttention(nn.Module): qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - 
attn_type=AttentionType.DECODER) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) return output @@ -348,7 +336,8 @@ class BartCrossAttention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_DECODER) def forward( self, @@ -372,12 +361,7 @@ class BartCrossAttention(nn.Module): _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_DECODER) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) return output @@ -644,10 +628,7 @@ class BartEncoder(nn.Module): # retrieve input_ids and inputs_embeds inputs_embeds = self.embed_tokens(input_ids) - embed_pos = self.embed_positions( - positions, - AttentionType.ENCODER, - ) + embed_pos = self.embed_positions(positions) embed_pos = embed_pos.to(inputs_embeds.device) hidden_states = inputs_embeds + embed_pos @@ -734,10 +715,7 @@ class BartDecoder(nn.Module): inputs_embeds = self.embed_tokens(decoder_input_ids) # embed positions - embed_pos = self.embed_positions( - decoder_positions, - AttentionType.DECODER, - ) + embed_pos = self.embed_positions(decoder_positions) embed_pos = embed_pos.to(inputs_embeds.device) hidden_states = inputs_embeds + embed_pos diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index c1d47b1bc9..4be136543d 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -238,7 +238,8 @@ class BertSelfAttention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_ONLY) def forward( self, @@ -248,12 +249,7 @@ class BertSelfAttention(nn.Module): ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_ONLY) + output = self.attn(q, k, v, kv_cache, attn_metadata) return output diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 6536f98077..c5046e06ed 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -770,6 +770,7 @@ class MllamaTextCrossAttention(nn.Module): self.scaling, self.num_local_key_value_heads, prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_DECODER, ) def forward( @@ -805,13 +806,9 @@ class MllamaTextCrossAttention(nn.Module): kv_range_for_decode, attn_metadata) else: - output = self.attn(q.view(-1, - self.num_local_heads * self.head_dim), - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_DECODER) + output = self.attn( + q.view(-1, self.num_local_heads * self.head_dim), k, v, + kv_cache, attn_metadata) out, _ = self.o_proj(output) return out diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 88f4ea4352..01745b5fd5 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -107,7 +107,8 @@ class Qwen2Attention(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, rope_scaling: Optional[Tuple] = None, - prefix: str = "") -> None: + prefix: str = "", + attn_type: str = 
AttentionType.DECODER) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -160,7 +161,8 @@ class Qwen2Attention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=attn_type) def forward( self, @@ -168,17 +170,11 @@ class Qwen2Attention(nn.Module): hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, - attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=attn_type) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) return output @@ -197,6 +193,16 @@ class Qwen2DecoderLayer(nn.Module): # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) + + # By default, Qwen2 uses causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -207,6 +213,7 @@ class Qwen2DecoderLayer(nn.Module): quant_config=quant_config, rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", + attn_type=attn_type, ) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, @@ -220,15 +227,6 @@ class Qwen2DecoderLayer(nn.Module): self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - # By default, Qwen2 uses causal attention as it is a decoder-only model. - # You can override the HF config with `is_causal=False` to enable - # bidirectional attention, which is used in some embedding models - # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) - if getattr(config, "is_causal", True): - self._attn_type = AttentionType.DECODER - else: - self._attn_type = AttentionType.ENCODER_ONLY - def forward( self, positions: torch.Tensor, @@ -249,7 +247,6 @@ class Qwen2DecoderLayer(nn.Module): hidden_states=hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata, - attn_type=self._attn_type, ) # Fully Connected diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 65002f1ad7..b02bc9ffde 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -89,6 +89,7 @@ class FlashAttentionImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -119,6 +120,12 @@ class FlashAttentionImpl(AttentionImpl): f"Head size {head_size} is not supported by FlashAttention. 
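The `is_causal` handling moved into `Qwen2DecoderLayer.__init__` above decides the attention type once per layer, before the `Attention` module is built. As a usage illustration only, overriding the HF config is what flips a Qwen2 checkpoint into the bidirectional, encoder-only path mentioned in the code comment; the option names below (`task`, `hf_overrides`) and the `encode` entry point are assumptions about the vLLM front end around this version and are not part of this patch:

    from vllm import LLM

    # Hypothetical invocation: serve a Qwen2-based embedding model with
    # bidirectional attention by overriding the flag Qwen2DecoderLayer reads.
    llm = LLM(
        model="Alibaba-NLP/gte-Qwen2-7B-instruct",
        task="embed",
        hf_overrides={"is_causal": False},
    )
    outputs = llm.encode(["vLLM is a fast inference engine."])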
" f"Supported head sizes are: {support_head_sizes}.") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashAttentionImpl") + def forward( self, query: torch.Tensor, @@ -128,7 +135,6 @@ class FlashAttentionImpl(AttentionImpl): attn_metadata: FlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -142,12 +148,6 @@ class FlashAttentionImpl(AttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashAttentionImpl") - # NOTE(woosuk): FlashAttention does not support FP8 KV cache. assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") From 91b361ae898c944f823534121613f9d3dc19d7d1 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:58:16 -0800 Subject: [PATCH 034/309] [V1] Extend beyond image modality and support mixed-modality inference with Llava-OneVision (#11685) Signed-off-by: Roger Wang Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- docs/source/models/supported_models.md | 2 +- tests/multimodal/test_utils.py | 209 +++++++++++++++++- tests/v1/core/test_kv_cache_utils.py | 18 +- tests/v1/core/test_prefix_caching.py | 17 +- vllm/model_executor/models/interfaces.py | 6 +- vllm/model_executor/models/llava_onevision.py | 63 +++--- vllm/model_executor/models/molmo.py | 3 - vllm/multimodal/__init__.py | 3 + vllm/multimodal/hasher.py | 100 +++++++++ vllm/multimodal/inputs.py | 9 +- vllm/multimodal/processing.py | 92 +++----- vllm/multimodal/utils.py | 86 ++++++- vllm/v1/engine/__init__.py | 18 +- vllm/v1/engine/mm_input_mapper.py | 67 ------ vllm/v1/engine/processor.py | 101 ++++++--- vllm/v1/request.py | 48 ++-- vllm/v1/worker/gpu_model_runner.py | 70 +++--- 17 files changed, 633 insertions(+), 279 deletions(-) create mode 100644 vllm/multimodal/hasher.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 5a27780261..94a8849f7e 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -647,7 +647,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. 
- - ✅︎ - - + - ✅︎ * - `MiniCPMV` - MiniCPM-V - T + IE+ diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 6029f2e514..198344e5bd 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -2,16 +2,22 @@ import base64 import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import Dict, Tuple +from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple import numpy as np import pytest from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (MediaConnector, + merge_and_sort_multimodal_metadata, repeat_and_pad_placeholder_tokens) +if TYPE_CHECKING: + from vllm.multimodal.hasher import MultiModalHashDict + from vllm.multimodal.inputs import MultiModalPlaceholderDict + # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", @@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model): assert new_prompt == expected_prompt assert new_token_ids == expected_token_ids assert ranges == expected_ranges + + +# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. +class TestCase(NamedTuple): + mm_positions: "MultiModalPlaceholderDict" + mm_hashes: Optional["MultiModalHashDict"] + expected_modalities: list[str] + expected_ranges: list[PlaceholderRange] + expected_hashes: Optional[list[str]] + + +def test_merge_and_sort_multimodal_metadata(): + + test_cases = [ + # Single modality should return result as is but flattened + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ] + }, + mm_hashes={"image": ["hash1", "hash2"]}, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ], + expected_hashes=["hash1", "hash2"], + ), + + # Single modality without hashes return None for mm hash. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ] + }, + mm_hashes=None, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ], + expected_hashes=None, + ), + + # Multiple modalities with hashes should return sorted modalities + # and flattened ranges and hashes. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1", "audio_hash2"], + }, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=[ + "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + ], + ), + + # Multiple modalities without hashes should return sorted modalities + # and flattened ranges and None. 
+ TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes=None, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=None, + ), + + # Three modalities + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + ], + "video": [ + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1"], + "video": ["video_hash1", "video_hash2", "video_hash3"] + }, + expected_modalities=["audio", "video", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + expected_hashes=[ + "audio_hash1", "video_hash1", "video_hash2", "video_hash3", + "image_hash1", "image_hash2" + ], + ), + ] + + for (mm_positions, mm_hashes, expected_modalities, expected_ranges, + expected_hashes) in test_cases: + modalities, ranges, hashes = merge_and_sort_multimodal_metadata( + mm_positions, mm_hashes) + + assert modalities == expected_modalities + assert ranges == expected_ranges + assert hashes == expected_hashes + + +def test_merge_and_sort_multimodal_metadata_with_interleaving(): + + test_cases = [ + + #