[VLM] Qwen2.5-VL

This commit is contained in:
Roger Wang
2025-02-05 13:31:38 -08:00
committed by GitHub
parent 9a5b1554b4
commit bf3b79efb8
14 changed files with 1315 additions and 52 deletions

View File

@ -531,6 +531,36 @@ def run_qwen2_vl(question: str, modality: str):
return llm, prompt, stop_token_ids
# Qwen2.5-VL
def run_qwen2_5_vl(question: str, modality: str):
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = None
return llm, prompt, stop_token_ids
model_example_map = {
"aria": run_aria,
"blip-2": run_blip2,
@ -557,6 +587,7 @@ model_example_map = {
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
}

View File

@ -392,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
)
def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
print('WARNING: `qwen-vl-utils` not installed, input images will not '
'be automatically resized. You can enable this functionality by '
'`pip install qwen-vl-utils`.')
process_vision_info = None
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages,
return_video_sample_fps=False)
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data,
chat_template=None,
)
model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
@ -404,6 +461,7 @@ model_example_map = {
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,
}