[MODEL ADDITION] Ovis2 Model Addition (#15826)
Signed-off-by: Marco <121761685+mlinmg@users.noreply.github.com> Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@ -467,6 +467,18 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
patch_hf_runner=model_utils.molmo_patch_hf_runner,
|
||||
),
|
||||
"ovis2": VLMTestInfo(
|
||||
models=["AIDC-AI/Ovis2-1B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="half",
|
||||
# use sdpa mode for hf runner since ovis2 didn't work with flash_attn
|
||||
hf_model_kwargs={"llm_attn_implementation": "sdpa"},
|
||||
patch_hf_runner=model_utils.ovis2_patch_hf_runner,
|
||||
),
|
||||
"phi3v": VLMTestInfo(
|
||||
models=["microsoft/Phi-3.5-vision-instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
|
||||
@ -67,7 +67,7 @@ def run_test(
|
||||
"disable_mm_preprocessor_cache": True,
|
||||
}
|
||||
if model_info.tokenizer:
|
||||
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
|
||||
vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
|
||||
if model_info.tokenizer_mode:
|
||||
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
|
||||
if model_info.hf_overrides:
|
||||
|
||||
@ -676,3 +676,33 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Adapt an HfRunner so that it can be used with the Ovis2 HF model.

    Three things are patched on the runner that is passed in:

    * ``.to(hf_model.dtype)`` is called on the model's ``visual_tokenizer``,
      ``vte`` and ``llm`` attributes.
    * ``model.get_output_embeddings`` is replaced by a callable that
      forwards to ``model.llm.get_output_embeddings()``.
    * ``hf_model.processor`` is replaced by a function that builds the
      generation inputs through ``model.preprocess_inputs``.

    Args:
        hf_model: Runner whose ``model`` and ``processor`` are patched.

    Returns:
        The same ``hf_model`` object that was passed in.
    """
    # Same order as listed: visual tokenizer, then vte, then the llm.
    for submodule_name in ("visual_tokenizer", "vte", "llm"):
        getattr(hf_model.model, submodule_name).to(hf_model.dtype)

    def _llm_output_embeddings():
        # Looked up at call time, so it always reflects the current llm.
        return hf_model.model.llm.get_output_embeddings()

    hf_model.model.get_output_embeddings = _llm_output_embeddings

    def processor(*args, text="", images=None, **kwargs):
        """Build a ``BatchFeature`` from a chat-formatted prompt and images.

        Only ``text`` and ``images`` are used; extra positional and keyword
        arguments are accepted and ignored.
        """
        tokenizer = hf_model.model.get_text_tokenizer()

        # A lone image is wrapped in a list; anything else is passed as-is.
        if isinstance(images, Image):
            images = [images]

        # Keep only what sits between the user-turn marker and the first
        # end-of-turn marker that follows it. Raises IndexError when the
        # user-turn marker is absent from ``text``.
        after_user_marker = text.split("<|im_start|>user\n")[1]
        user_text = after_user_marker.split("<|im_end|>\n")[0]

        # The first element returned by preprocess_inputs is not needed.
        _, token_ids, pixels = hf_model.model.preprocess_inputs(
            text_or_conversations=user_text, images=images)

        # True wherever the token differs from the tokenizer's pad token.
        non_pad_mask = torch.ne(token_ids, tokenizer.pad_token_id)

        # Each tensor gets a new leading dimension via unsqueeze(0).
        # NOTE(review): the token ids are stored under "inputs" rather than
        # "input_ids" - presumably what the Ovis2 generate call expects;
        # confirm against the model code.
        return BatchFeature(
            data={
                "inputs": token_ids.unsqueeze(0),
                "pixel_values": pixels.unsqueeze(0),
                "attention_mask": non_pad_mask.unsqueeze(0),
            },
            tensor_type="pt",
        )

    hf_model.processor = processor
    return hf_model
|
||||
|
||||
Reference in New Issue
Block a user