[Core] Update dtype detection and defaults (#14858)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-19 13:49:33 +08:00
parent 8b3e94a357
commit f690372b68
22 changed files with 175 additions and 227 deletions
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@ -5,11 +5,10 @@ from typing import Optional
 import numpy as np
 import pytest
 import pytest_asyncio
-from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from transformers import AutoModel, AutoTokenizer

 from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
@ -107,8 +106,6 @@ def run_test(
    **kwargs,
 ):
    """Inference result should be the same between hf and vllm."""
-    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
@ -124,15 +121,7 @@ def run_test(
            for vllm_prompt, _, audio in prompts_and_audios
        ]

-    def process(hf_inputs: BatchEncoding, **kwargs):
-        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
-            .to(torch_dtype)  # type: ignore
-        return hf_inputs
-
-    with hf_runner(model,
-                   dtype=dtype,
-                   postprocess_inputs=process,
-                   auto_cls=AutoModel) as hf_model:
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@ -122,9 +122,6 @@ VLM_TEST_SETTINGS = {
            "cherry_blossom": "What is in the picture?",
        }),
        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
    #         "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
    #     }),
    #     multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",    # noqa: E501
-    #     postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
    #     stop_str=["<|im_end|>"],
    #     image_size_factors=[(0.10, 0.15)],
    #     max_tokens=64,
@ -200,9 +196,6 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
-        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
@ -258,7 +250,6 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
-        dtype="bfloat16",
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
    ),
@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
        }),
        max_model_len=2048,
        max_num_seqs=2,
-        dtype="bfloat16",
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
-        dtype="bfloat16",
        use_tokenizer_eos=True,
        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
-        # NOTE: Mono-InternVL-2B doesn't work with fp16,
-        # it will result NaN during inference.
-        # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
-        dtype="bfloat16",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values_videos"
-        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
@ -378,9 +360,6 @@ VLM_TEST_SETTINGS = {
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        get_stop_token_ids=lambda tok: [128009],
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
-        postprocess_inputs=model_utils.wrap_inputs_post_processor,
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
    ),
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
-        postprocess_inputs=model_utils.ignore_inputs_post_processor(
-            "image_sizes"
-        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
-        patch_hf_runner=model_utils.minicpmo_patch_hf_runner
+        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
    ),
    "minicpmv_26": VLMTestInfo(
        models=["openbmb/MiniCPM-V-2_6"],
@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
-        postprocess_inputs=model_utils.ignore_inputs_post_processor(
-            "image_sizes"
-        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        max_num_seqs=2,
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
-        postprocess_inputs=model_utils.molmo_post_processor,
    ),
    # Tests for phi3v currently live in another file because of a bug in
    # transformers. Once this issue is fixed, we can enable them here instead.
@ -482,9 +455,6 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
-        postprocess_inputs=model_utils.cast_dtype_post_processor(
-            "pixel_values"
-        ),
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union

 import torch
 from PIL.Image import Image
-from transformers import BatchEncoding
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from vllm.config import TaskOption
@ -31,7 +30,6 @@ def run_test(
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    auto_cls: type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
-    postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
    comparator: Callable[..., None],
    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
    stop_str: Optional[list[str]],
@ -101,7 +99,6 @@ def run_test(
    hf_model = hf_runner(model,
                         dtype=dtype,
                         auto_cls=auto_cls,
-                         postprocess_inputs=postprocess_inputs,
                         model_kwargs=hf_model_kwargs)

    # Some models need to patch things like the model processor, e.g., internvl
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@ -6,16 +6,15 @@ typically specific to a small subset of models.
 import re
 import types
 from pathlib import PosixPath
-from typing import Callable, Optional, Union
+from typing import Optional, Union

 import torch
 from PIL.Image import Image
-from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
+from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
                          GenerationConfig)

 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from .....conftest import HfRunner, ImageAsset, _ImageAssets
 from .types import RunnerOutput
@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
    return [asset.image_embeds for asset in image_assets]


-####### postprocessors to run on HF BatchEncoding
-def cast_dtype_post_processor(
-        hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
-    """Gets a handle to a post processor which converts a given key into a
-    target data type."""
-
-    def process(hf_inputs: BatchEncoding, dtype: str):
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-        hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
-        return hf_inputs
-
-    return process
-
-
-def ignore_inputs_post_processor(
-        hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
-    """Gets a handle to a post processor which ignores a given key."""
-
-    def process(hf_inputs: BatchEncoding, dtype: str):
-        del hf_inputs[hf_inp_key]
-        return hf_inputs
-
-    return process
-
-
-def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
-    return {"model_inputs": hf_inputs}
-
-
-def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
-    hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
-    return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
-
-
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
        tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            for k in inputs.keys()  # noqa
            if k not in ("seq_lens", "sft_format")
        }
-        inputs = BatchEncoding(data=inputs, tensor_type="pt")
-        return inputs
+        return BatchFeature(data=inputs, tensor_type="pt")

    hf_model.processor = processor
    hf_model.model.get_output_embeddings = lambda: \
@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


-def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    orig_generate = hf_model.model.generate

-    def _generate(self, *args, **kwargs):
+    def _generate(
+        self,
+        *args,
+        input_ids=None,
+        pixel_values=None,
+        image_sizes=None,
+        image_bound=None,
+        tgt_sizes=None,
+        **kwargs,
+    ):
+        model_inputs = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+            "image_sizes": image_sizes,
+            "image_bound": image_bound,
+            "tgt_sizes": tgt_sizes,
+        }
+        for k in list(model_inputs.keys()):
+            if model_inputs[k] is None:
+                model_inputs.pop(k)
+
+        return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
+    return hf_model
+
+
+def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    orig_generate = hf_model.model.generate
+
+    def _generate(self, *args, image_sizes=None, **kwargs):
+        return orig_generate(*args, decode_text=False, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
+    return hf_model
+
+
+def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    orig_generate = hf_model.model.generate
+
+    def _generate(self, *args, image_sizes=None, **kwargs):
        return orig_generate(*args, decode_text=False, **kwargs)

    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

    def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
        batch = {
-            k: kwargs.pop(k)
+            k: kwargs.pop(k).unsqueeze(0)
            for k in ("input_ids", "images", "image_input_idx", "image_masks")
            if k in kwargs
        }
+        batch = BatchFeature(batch).to(dtype=self.dtype)

        return self.generate_from_batch(
            batch,
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
 import torch
 from PIL.Image import Image
 from pytest import MarkDecorator
-from transformers import AutoModelForCausalLM, BatchEncoding
+from transformers import AutoModelForCausalLM
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from vllm.config import TaskOption
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import identity

 from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
 from ....utils import check_logprobs_close
@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
    # Indicates we should explicitly pass the EOS from the tokenizer
    use_tokenizer_eos: bool = False
    auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
-    # Callable to pass to the HF runner to run on inputs; for now, we also pass
-    # the data type to input post processing, because almost all of the uses of
-    # postprocess_inputs are to fix the data types of BatchEncoding values.
-    postprocess_inputs: Callable[[BatchEncoding, str],
-                                 BatchEncoding] = identity
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None

    # Post processors that if defined, will run on the outputs of the
@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
    # is all combinations of .models + all fields below
    max_tokens: Union[int, tuple[int]] = 128
    num_logprobs: Union[int, tuple[int]] = 5
-    dtype: Union[str, Iterable[str]] = "half"
+    dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
    distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
    # Only expanded in video tests
    num_video_frames: Union[int, tuple[int]] = 16
@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
            "vllm_output_post_proc": self.vllm_output_post_proc,
            "auto_cls": self.auto_cls,
            "use_tokenizer_eos": self.use_tokenizer_eos,
-            "postprocess_inputs": self.postprocess_inputs,
            "comparator": self.comparator,
            "get_stop_token_ids": self.get_stop_token_ids,
            "hf_model_kwargs": self.hf_model_kwargs,