From ba2dfbb0c27d8a8d224e41cebf83cfd6fcfd9293 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 18 Jul 2025 00:13:57 -0700
Subject: [PATCH] [Misc] Make MM embedding merge interface explicit in model
 runner (#21147)

Signed-off-by: Roger Wang
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/v1/worker/gpu_model_runner.py | 9 ++++-----
 vllm/v1/worker/tpu_model_runner.py | 9 ++++-----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fc7f253888..60fb78c060 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1328,11 +1328,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
             input_ids = self.input_ids[:num_scheduled_tokens]
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            inputs_embeds = self.model.get_input_embeddings(
+                input_ids=input_ids,
+                multimodal_embeddings=mm_embeds or None,
+            )
             # TODO(woosuk): Avoid the copy. Optimize.
             self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index ad62d20438..8565df4297 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -937,11 +937,10 @@ class TPUModelRunner(LoRAModelRunnerMixin):
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            inputs_embeds = self.model.get_input_embeddings(
+                input_ids=input_ids,
+                multimodal_embeddings=mm_embeds,
+            )
             return None, inputs_embeds
         else:
             # For text-only models, we use token ids as input.