[VLM] Remove image_input_type from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@ -25,17 +25,11 @@ def iter_llava_configs(model_name: str):
|
||||
}
|
||||
|
||||
for (h, w), f in image_hw_to_feature_size.items():
|
||||
- for input_type, input_shape in [
- (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
- (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
- ]:
- yield (model_name,
- VisionLanguageConfig(image_input_type=input_type,
- image_feature_size=f,
- image_token_id=32000,
- image_input_shape=input_shape,
- image_processor=model_name,
- image_processor_revision=None))
+ input_shape = (1, 3, h, w)
+ yield (model_name,
+ VisionLanguageConfig(image_feature_size=f,
+ image_token_id=32000,
+ image_input_shape=input_shape))
|
||||
|
||||
|
||||
model_and_vl_config = [
|
||||
@ -81,8 +75,8 @@ def run_test(
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
- For vllm runner, we provide MultiModalData objects and corresponding
- vision language config as input.
+ For vllm runner, we provide MultiModalDataDict objects
+ and corresponding vision language config as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
@ -104,7 +98,7 @@ def run_test(
|
||||
# NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
|
||||
# we must put it inside the vllm_runner context manager
|
||||
# i.e. after creating vLLM instance.
|
||||
- vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+ vllm_images = [asset.for_vllm() for asset in image_assets]
|
||||
|
||||
vllm_image_prompts = [
|
||||
p.replace("<image>", "<image>" * vlm_config.image_feature_size)
|
||||
|
||||
@ -33,16 +33,13 @@ def iter_llava_next_configs(model_name: str):
|
||||
}
|
||||
|
||||
for (h, w), f in image_hw_to_feature_size.items():
|
||||
- for input_type, input_shape in [
- (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
- ]:
- yield (model_name,
- VisionLanguageConfig(image_input_type=input_type,
- image_feature_size=f,
- image_token_id=32000,
- image_input_shape=input_shape,
- image_processor=model_name,
- image_processor_revision=None))
+ input_shape = (1, 3, h, w)
+ yield (model_name,
+ VisionLanguageConfig(
+ image_feature_size=f,
+ image_token_id=32000,
+ image_input_shape=input_shape,
+ ))
|
||||
|
||||
|
||||
model_and_vl_config = [
|
||||
@ -85,14 +82,14 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
- For vllm runner, we provide MultiModalData objects and corresponding
- vision language config as input.
+ For vllm runner, we provide MultiModalDataDict objects
+ and corresponding vision language config as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
model_id, vlm_config = model_and_config
|
||||
hf_images = [asset.for_hf() for asset in image_assets]
|
||||
- vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+ vllm_images = [asset.for_vllm() for asset in image_assets]
|
||||
|
||||
with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
|
||||
|
||||
@ -27,16 +27,11 @@ def iter_phi3v_configs(model_name: str):
|
||||
}
|
||||
|
||||
for (h, w), f in image_hw_to_feature_size.items():
|
||||
- for input_type, input_shape in [
- (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
- ]:
- yield (model_name,
- VisionLanguageConfig(image_input_type=input_type,
- image_feature_size=f,
- image_token_id=32044,
- image_input_shape=input_shape,
- image_processor=model_name,
- image_processor_revision=None))
+ input_shape = (1, 3, h, w)
+ yield (model_name,
+ VisionLanguageConfig(image_feature_size=f,
+ image_token_id=32044,
+ image_input_shape=input_shape))
|
||||
|
||||
|
||||
model_and_vl_config = [
|
||||
@ -89,8 +84,8 @@ def run_test(
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
- For vllm runner, we provide MultiModalData objects and corresponding
- vision language config as input.
+ For vllm runner, we provide MultiModalDataDict objects
+ and corresponding vision language config as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
@ -113,7 +108,7 @@ def run_test(
|
||||
# we must put it inside the vllm_runner context manager
|
||||
# i.e. after creating vLLM instance.
|
||||
|
||||
- vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+ vllm_images = [asset.for_vllm() for asset in image_assets]
|
||||
|
||||
vllm_image_prompts = [
|
||||
p.replace("<|image_1|>",
|
||||
|
||||
Reference in New Issue
Block a user