[VLM] Support HF format Phi-4-MM model (#17121)
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
@ -760,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process multi images inputs.
|
||||
"""
|
||||
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
# Note - mm_processor_kwargs can also be passed to generate/chat calls
|
||||
mm_processor_kwargs={"dynamic_hd": 4},
|
||||
)
|
||||
|
||||
placeholders = "<|image|>" * len(image_urls)
|
||||
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen-VL-Chat"
|
||||
engine_args = EngineArgs(
|
||||
@ -988,6 +1022,7 @@ model_example_map = {
|
||||
"ovis": load_ovis,
|
||||
"phi3_v": load_phi3v,
|
||||
"phi4_mm": load_phi4mm,
|
||||
"phi4_multimodal": load_phi4_multimodal,
|
||||
"pixtral_hf": load_pixtral_hf,
|
||||
"qwen_vl_chat": load_qwen_vl_chat,
|
||||
"qwen2_vl": load_qwen2_vl,
|
||||
|
||||
Reference in New Issue
Block a user