[Model] Support NVLM-D and fix QK Norm in InternViT (#9045)

commit 151ef4efd2 (parent f19da64871)
Author: Cyrus Leung
Date: 2024-10-07 19:55:12 +08:00
Committed by: GitHub
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>

12 changed files with 518 additions and 236 deletions
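Note on the QK Norm part of this change: the hunks reproduced below only cover the two example scripts, so the InternViT fix itself is not visible here. For background, QK Normalization normalizes the query and key projections before the attention product to keep their magnitudes bounded. The sketch below is a generic PyTorch illustration of that idea, not the InternViT code touched by this commit; the per-head nn.RMSNorm layout and all names are assumptions of the sketch (nn.RMSNorm requires torch >= 2.4).

# Illustrative only: a generic attention block with QK Normalization.
# This is NOT the InternViT code changed in this commit.
import torch
import torch.nn.functional as F
from torch import nn


class QKNormAttention(nn.Module):

    def __init__(self, embed_dim: int, num_heads: int, eps: float = 1e-6):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        # Normalize queries and keys before attention ("QK Norm").
        self.q_norm = nn.RMSNorm(self.head_dim, eps=eps)
        self.k_norm = nn.RMSNorm(self.head_dim, eps=eps)
        self.proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim=2)                        # each (B, N, H, D)
        q, k = self.q_norm(q), self.k_norm(k)              # norm over head_dim
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))   # (B, H, N, D)
        out = F.scaled_dot_product_attention(q, k, v)
        out = out.transpose(1, 2).reshape(B, N, C)
        return self.proj(out)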


@@ -18,7 +18,7 @@ from vllm.utils import FlexibleArgumentParser
 # LLaVA-1.5
-def run_llava(question, modality):
+def run_llava(question: str, modality: str):
     assert modality == "image"
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
@@ -29,7 +29,7 @@ def run_llava(question, modality):
 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question, modality):
+def run_llava_next(question: str, modality: str):
     assert modality == "image"
     prompt = f"[INST] <image>\n{question} [/INST]"
@@ -40,7 +40,7 @@ def run_llava_next(question, modality):
 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question, modality):
+def run_llava_next_video(question: str, modality: str):
     assert modality == "video"
     prompt = f"USER: <video>\n{question} ASSISTANT:"
@@ -50,7 +50,7 @@ def run_llava_next_video(question, modality):
 # LLaVA-OneVision
-def run_llava_onevision(question, modality):
+def run_llava_onevision(question: str, modality: str):
     if modality == "video":
         prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
@@ -67,7 +67,7 @@ def run_llava_onevision(question, modality):
 # Fuyu
-def run_fuyu(question, modality):
+def run_fuyu(question: str, modality: str):
     assert modality == "image"
     prompt = f"{question}\n"
@@ -77,7 +77,7 @@ def run_fuyu(question, modality):
 # Phi-3-Vision
-def run_phi3v(question, modality):
+def run_phi3v(question: str, modality: str):
     assert modality == "image"
     prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
@@ -112,7 +112,7 @@ def run_phi3v(question, modality):
 # PaliGemma
-def run_paligemma(question, modality):
+def run_paligemma(question: str, modality: str):
     assert modality == "image"
     # PaliGemma has special prompt format for VQA
@@ -123,7 +123,7 @@ def run_paligemma(question, modality):
 # Chameleon
-def run_chameleon(question, modality):
+def run_chameleon(question: str, modality: str):
     assert modality == "image"
     prompt = f"{question}<image>"
@@ -133,7 +133,7 @@ def run_chameleon(question, modality):
 # MiniCPM-V
-def run_minicpmv(question, modality):
+def run_minicpmv(question: str, modality: str):
     assert modality == "image"
     # 2.0
@@ -176,7 +176,7 @@ def run_minicpmv(question, modality):
 # InternVL
-def run_internvl(question, modality):
+def run_internvl(question: str, modality: str):
     assert modality == "image"
     model_name = "OpenGVLab/InternVL2-2B"
@@ -203,8 +203,32 @@ def run_internvl(question, modality):
     return llm, prompt, stop_token_ids
 
 
+# NVLM-D
+def run_nvlm_d(question: str, modality: str):
+    assert modality == "image"
+
+    model_name = "nvidia/NVLM-D-72B"
+
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        tensor_parallel_size=4,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # BLIP-2
-def run_blip2(question, modality):
+def run_blip2(question: str, modality: str):
     assert modality == "image"
 
     # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
@@ -216,7 +240,7 @@ def run_blip2(question, modality):
 # Qwen
-def run_qwen_vl(question, modality):
+def run_qwen_vl(question: str, modality: str):
     assert modality == "image"
     llm = LLM(
@@ -232,7 +256,7 @@ def run_qwen_vl(question, modality):
 # Qwen2-VL
-def run_qwen2_vl(question, modality):
+def run_qwen2_vl(question: str, modality: str):
     assert modality == "image"
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
@@ -252,8 +276,8 @@ def run_qwen2_vl(question, modality):
     return llm, prompt, stop_token_ids
-# LLama
-def run_mllama(question, modality):
+# LLama 3.2
+def run_mllama(question: str, modality: str):
     assert modality == "image"
     model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@@ -287,6 +311,7 @@ model_example_map = {
     "minicpmv": run_minicpmv,
     "blip-2": run_blip2,
     "internvl_chat": run_internvl,
+    "NVLM_D": run_nvlm_d,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
     "mllama": run_mllama,


@@ -144,6 +144,39 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
     )
 
 
+def load_nvlm_d(question: str, image_urls: List[str]):
+    model_name = "nvidia/NVLM-D-72B"
+
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        tensor_parallel_size=4,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    stop_token_ids = None
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
     try:
         from qwen_vl_utils import process_vision_info
@@ -204,6 +237,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
 model_example_map = {
     "phi3_v": load_phi3v,
     "internvl_chat": load_internvl,
+    "NVLM_D": load_nvlm_d,
     "qwen2_vl": load_qwen2_vl,
     "qwen_vl_chat": load_qwenvl_chat,
 }
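And a similar minimal sketch for the multi-image path, showing how the ModelRequestData returned by load_nvlm_d would be passed to generation. The URLs and sampling values are placeholders, not part of the diff.

# Usage sketch for the multi-image path (assumed harness, not part of the diff).
# Assumes it runs in the same file as load_nvlm_d above.
from vllm import SamplingParams

image_urls = [
    "https://example.com/a.jpg",  # placeholder URLs
    "https://example.com/b.jpg",
]
req = load_nvlm_d("What is the content of each image?", image_urls)

outputs = req.llm.generate(
    {
        "prompt": req.prompt,
        # A list of PIL images; requires limit_mm_per_prompt set above.
        "multi_modal_data": {"image": req.image_data},
    },
    sampling_params=SamplingParams(temperature=0.0,
                                   max_tokens=128,
                                   stop_token_ids=req.stop_token_ids))

for o in outputs:
    print(o.outputs[0].text)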