[Bugfix] Fix incorrect qwen2.5-vl attention mask pre-computation (#15200)

Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
Isotr0py
2025-03-21 10:18:04 +08:00
committed by GitHub
parent 2e0b4cfde0
commit 1e508343e1
3 changed files with 37 additions and 4 deletions

View File

@ -508,6 +508,19 @@ VLM_TEST_SETTINGS = {
limit_mm_per_prompt={"image": 4},
)],
),
# regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
limit_mm_per_prompt={"image": 1},
)],
),
}
# yapf: enable

View File

@ -1,7 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
"""Custom input builders for edge-cases in different models."""
from io import BytesIO
from typing import Callable
import requests
from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
]
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
image = Image.open(BytesIO(requests.get(image_url).content))
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)