[Model] Support NVLM-D and fix QK Norm in InternViT (#9045)
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
@@ -18,7 +18,7 @@ from vllm.utils import FlexibleArgumentParser


 # LLaVA-1.5
-def run_llava(question, modality):
+def run_llava(question: str, modality: str):
     assert modality == "image"

     prompt = f"USER: <image>\n{question}\nASSISTANT:"
@@ -29,7 +29,7 @@ def run_llava(question, modality):


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question, modality):
+def run_llava_next(question: str, modality: str):
     assert modality == "image"

     prompt = f"[INST] <image>\n{question} [/INST]"
@@ -40,7 +40,7 @@ def run_llava_next(question, modality):

 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question, modality):
+def run_llava_next_video(question: str, modality: str):
     assert modality == "video"

     prompt = f"USER: <video>\n{question} ASSISTANT:"
@@ -50,7 +50,7 @@ def run_llava_next_video(question, modality):


 # LLaVA-OneVision
-def run_llava_onevision(question, modality):
+def run_llava_onevision(question: str, modality: str):

     if modality == "video":
         prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
@@ -67,7 +67,7 @@ def run_llava_onevision(question, modality):


 # Fuyu
-def run_fuyu(question, modality):
+def run_fuyu(question: str, modality: str):
     assert modality == "image"

     prompt = f"{question}\n"
@@ -77,7 +77,7 @@ def run_fuyu(question, modality):


 # Phi-3-Vision
-def run_phi3v(question, modality):
+def run_phi3v(question: str, modality: str):
     assert modality == "image"

     prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
@@ -112,7 +112,7 @@ def run_phi3v(question, modality):


 # PaliGemma
-def run_paligemma(question, modality):
+def run_paligemma(question: str, modality: str):
     assert modality == "image"

     # PaliGemma has special prompt format for VQA
@@ -123,7 +123,7 @@ def run_paligemma(question, modality):


 # Chameleon
-def run_chameleon(question, modality):
+def run_chameleon(question: str, modality: str):
     assert modality == "image"

     prompt = f"{question}<image>"
@@ -133,7 +133,7 @@ def run_chameleon(question, modality):


 # MiniCPM-V
-def run_minicpmv(question, modality):
+def run_minicpmv(question: str, modality: str):
     assert modality == "image"

     # 2.0
@@ -176,7 +176,7 @@ def run_minicpmv(question, modality):


 # InternVL
-def run_internvl(question, modality):
+def run_internvl(question: str, modality: str):
     assert modality == "image"

     model_name = "OpenGVLab/InternVL2-2B"
@@ -203,8 +203,32 @@ def run_internvl(question, modality):
     return llm, prompt, stop_token_ids


+# NVLM-D
+def run_nvlm_d(question: str, modality: str):
+    assert modality == "image"
+
+    model_name = "nvidia/NVLM-D-72B"
+
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        tensor_parallel_size=4,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # BLIP-2
-def run_blip2(question, modality):
+def run_blip2(question: str, modality: str):
     assert modality == "image"

     # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
@@ -216,7 +240,7 @@ def run_blip2(question, modality):


 # Qwen
-def run_qwen_vl(question, modality):
+def run_qwen_vl(question: str, modality: str):
     assert modality == "image"

     llm = LLM(
@@ -232,7 +256,7 @@ def run_qwen_vl(question, modality):


 # Qwen2-VL
-def run_qwen2_vl(question, modality):
+def run_qwen2_vl(question: str, modality: str):
     assert modality == "image"

     model_name = "Qwen/Qwen2-VL-7B-Instruct"
@@ -252,8 +276,8 @@ def run_qwen2_vl(question, modality):
     return llm, prompt, stop_token_ids


-# LLama
-def run_mllama(question, modality):
+# LLama 3.2
+def run_mllama(question: str, modality: str):
     assert modality == "image"

     model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@@ -287,6 +311,7 @@ model_example_map = {
     "minicpmv": run_minicpmv,
     "blip-2": run_blip2,
     "internvl_chat": run_internvl,
+    "NVLM_D": run_nvlm_d,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
     "mllama": run_mllama,
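For orientation, here is a minimal sketch of how the (llm, prompt, stop_token_ids) triple returned by run_nvlm_d above would be consumed for single-image inference, mirroring the pattern used by the other run_* helpers in this example. The sampling settings and the local image path are illustrative assumptions, not part of this diff.

from PIL import Image

from vllm import SamplingParams

llm, prompt, stop_token_ids = run_nvlm_d("What is shown in this image?", "image")

# Illustrative sampling settings; adjust as needed.
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)

image = Image.open("example.jpg")  # hypothetical local image
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params,
)
for output in outputs:
    print(output.outputs[0].text)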
@@ -144,6 +144,39 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
     )


+def load_nvlm_d(question: str, image_urls: List[str]):
+    model_name = "nvidia/NVLM-D-72B"
+
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        tensor_parallel_size=4,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    stop_token_ids = None
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
     try:
         from qwen_vl_utils import process_vision_info
@@ -204,6 +237,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
 model_example_map = {
     "phi3_v": load_phi3v,
     "internvl_chat": load_internvl,
+    "NVLM_D": load_nvlm_d,
     "qwen2_vl": load_qwen2_vl,
     "qwen_vl_chat": load_qwenvl_chat,
 }
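Similarly, a minimal sketch of driving the multi-image load_nvlm_d path above via the ModelRequestData it returns; the image URLs and sampling settings here are illustrative assumptions, not part of this diff.

from vllm import SamplingParams

# Hypothetical image URLs, for illustration only.
IMAGE_URLS = [
    "https://example.com/first.jpg",
    "https://example.com/second.jpg",
]

req_data = load_nvlm_d("What are the differences between these images?", IMAGE_URLS)

sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=128,
                                 stop_token_ids=req_data.stop_token_ids)

outputs = req_data.llm.generate(
    {
        "prompt": req_data.prompt,
        "multi_modal_data": {"image": req_data.image_data},
    },
    sampling_params=sampling_params,
)
for output in outputs:
    print(output.outputs[0].text)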