[VLM] Support HF format Phi-4-MM model (#17121)

Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
Isotr0py
2025-07-27 11:07:57 +08:00
committed by GitHub
parent 20950b29fb
commit eed2f463b2
10 changed files with 1847 additions and 5 deletions

View File

@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
)
def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs.
"""
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "<|audio|>" * audio_count
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
engine_args = EngineArgs(
model=model_path,
max_model_len=12800,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
limit_mm_per_prompt={"audio": audio_count},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompts,
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
)
# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@ -303,6 +334,7 @@ model_example_map = {
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox,

View File

@ -1097,6 +1097,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
)
# HF format Phi-4-multimodal-instruct
def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs.
"""
assert modality == "image"
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
prompts = [
f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
]
engine_args = EngineArgs(
model=model_path,
max_model_len=5120,
max_num_seqs=2,
max_num_batched_tokens=12800,
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
# Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1356,6 +1391,7 @@ model_example_map = {
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
"phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,

View File

@ -760,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process multi images inputs.
"""
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
engine_args = EngineArgs(
model=model_path,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 4},
)
placeholders = "<|image|>" * len(image_urls)
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs(
@ -988,6 +1022,7 @@ model_example_map = {
"ovis": load_ovis,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"phi4_multimodal": load_phi4_multimodal,
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,