[Model][VLM] Add Qwen2.5-Omni model support (thinker only) (#15130)
Signed-off-by: fyabc <suyang.fy@alibaba-inc.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Xiong Wang <wangxiongts@163.com>
@@ -130,6 +130,36 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
    )


# Qwen2.5-Omni
def run_qwen2_5_omni(question: str, audio_count: int):
    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"audio": audio_count},
    )

    audio_in_prompt = "".join([
        "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
    ])

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n"
              f"{audio_in_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
    )


# Ultravox 0.5-1B
def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

@@ -182,6 +212,7 @@ model_example_map = {
    "minicpmo": run_minicpmo,
    "phi4_mm": run_phi4mm,
    "qwen2_audio": run_qwen2_audio,
    "qwen2_5_omni": run_qwen2_5_omni,
    "ultravox": run_ultravox,
    "whisper": run_whisper,
}
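For reference, here is a hedged illustration (not part of the diff) of the prompt string the new `run_qwen2_5_omni` helper builds when `audio_count=2`; the question text is only a stand-in for whatever the caller passes:

```python
# Illustrative expansion only: the prompt run_qwen2_5_omni builds for audio_count=2.
default_system = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
    "Group, capable of perceiving auditory and visual inputs, as well as "
    "generating text and speech.")
question = "Are these two audio clips the same?"  # caller-supplied question
prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
          "<|im_start|>user\n"
          "<|audio_bos|><|AUDIO|><|audio_eos|>\n"
          "<|audio_bos|><|AUDIO|><|audio_eos|>\n"
          f"{question}<|im_end|>\n"
          "<|im_start|>assistant\n")
```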
examples/offline_inference/qwen2_5_omni/README.md (new file, 32 lines)
@@ -0,0 +1,32 @@
# Qwen2.5-Omni Offline Inference Examples

This folder provides several example scripts showing how to run offline inference with Qwen2.5-Omni.

## Thinker Only

```bash
# Audio + image + video
python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities

# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video

# Multiple audios
VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios
```

This script runs the thinker part of Qwen2.5-Omni and generates a text response.

You can also test Qwen2.5-Omni on a single modality:

```bash
# Process audio inputs
python examples/offline_inference/audio_language.py --model-type qwen2_5_omni

# Process image inputs
python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni

# Process video inputs
python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni
```
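For a self-contained picture of what these example scripts do, here is a minimal sketch (not part of this commit) of a single-audio request using the same thinker prompt format and the vLLM APIs shown elsewhere in this diff:

```python
# Minimal single-audio sketch for the Qwen2.5-Omni thinker (illustrative only).
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

default_system = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
    "Group, capable of perceiving auditory and visual inputs, as well as "
    "generating text and speech.")

question = "What is recited in the audio?"
prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
          "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
          f"{question}<|im_end|>\n"
          "<|im_start|>assistant\n")

llm = LLM(model="Qwen/Qwen2.5-Omni-7B",
          max_model_len=4096,
          max_num_seqs=5,
          limit_mm_per_prompt={"audio": 1})

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```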
examples/offline_inference/qwen2_5_omni/only_thinker.py (new file, 160 lines)
@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""

from typing import NamedTuple

import vllm.envs as envs
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser


class QueryResult(NamedTuple):
    inputs: dict
    limit_mm_per_prompt: dict[str, int]


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

default_system = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
    "Group, capable of perceiving auditory and visual inputs, as well as "
    "generating text and speech.")


def get_mixed_modalities_query() -> QueryResult:
    question = ("What is recited in the audio? "
                "What is the content of this image? Why is this video funny?")
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
              "<|vision_bos|><|IMAGE|><|vision_eos|>"
              "<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio":
                AudioAsset("mary_had_lamb").audio_and_sample_rate,
                "image":
                ImageAsset("cherry_blossom").pil_image.convert("RGB"),
                "video":
                VideoAsset(name="sample_demo_1.mp4",
                           num_frames=16).np_ndarrays,
            },
        },
        limit_mm_per_prompt={
            "audio": 1,
            "image": 1,
            "video": 1
        },
    )


def get_use_audio_in_video_query() -> QueryResult:
    question = ("Describe the content of the video, "
                "then convert what the baby say into text.")
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
    asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
    audio = asset.get_audio(sampling_rate=16000)
    assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                  "Please launch this example with "
                                  "`VLLM_USE_V1=0`.")
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "video": asset.np_ndarrays,
                "audio": audio,
            },
            "mm_processor_kwargs": {
                "use_audio_in_video": True,
            },
        },
        limit_mm_per_prompt={
            "audio": 1,
            "video": 1
        },
    )


def get_multi_audios_query() -> QueryResult:
    question = "Are these two audio clips the same?"
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
              "<|audio_bos|><|AUDIO|><|audio_eos|>"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio": [
                    AudioAsset("winning_call").audio_and_sample_rate,
                    AudioAsset("mary_had_lamb").audio_and_sample_rate,
                ],
            },
        },
        limit_mm_per_prompt={
            "audio": 2,
        },
    )


query_map = {
    "mixed_modalities": get_mixed_modalities_query,
    "use_audio_in_video": get_use_audio_in_video_query,
    "multi_audios": get_multi_audios_query,
}


def main(args):
    model_name = "Qwen/Qwen2.5-Omni-7B"
    query_result = query_map[args.query_type]()

    llm = LLM(model=model_name,
              max_model_len=5632,
              max_num_seqs=5,
              limit_mm_per_prompt=query_result.limit_mm_per_prompt,
              seed=args.seed)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2, max_tokens=64)

    outputs = llm.generate(query_result.inputs,
                           sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
    parser.add_argument('--query-type',
                        '-q',
                        type=str,
                        default="mixed_modalities",
                        choices=query_map.keys(),
                        help='Query type.')
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
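The `query_map` registry keeps the script easy to extend. As a hedged illustration (not part of this commit), an image-only query could be appended to `only_thinker.py`, reusing `QueryResult`, `default_system`, `ImageAsset`, and `query_map` from the script above:

```python
# Hypothetical extension: an image-only query in the style of the helpers above.
def get_single_image_query() -> QueryResult:
    question = "What is the content of this image?"
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|vision_bos|><|IMAGE|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "image": ImageAsset("cherry_blossom").pil_image.convert("RGB"),
            },
        },
        limit_mm_per_prompt={"image": 1},
    )


# Registering it makes it selectable via `-q single_image`.
query_map["single_image"] = get_single_image_query
```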
@@ -941,6 +941,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    )


# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str], modality: str):
    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    if modality == "image":
        placeholder = "<|IMAGE|>"
    elif modality == "video":
        placeholder = "<|VIDEO|>"

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

@@ -1010,6 +1046,7 @@ model_example_map = {
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}