Remove V0 Encoder-Decoder Support (#24907)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
@@ -1,222 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import pytest
from transformers import AutoModelForSeq2SeqLM

from vllm.sequence import SampleLogprobs

from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
                          HfRunner, VllmRunner)
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close


def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

    hf_output_str = output_str + "</s>"
    if decoder_prompt_type == DecoderPromptType.NONE:
        hf_output_str = "<s>" + hf_output_str

    return output_ids, hf_output_str, out_logprobs


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
    decoder_prompt_type: DecoderPromptType,
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    '''
    Test the vLLM BART model for a variety of encoder/decoder input prompts,
    by validating it against HuggingFace (HF) BART.

    Arguments:

    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_encoder_decoder_prompts: test fixture which provides a
      dictionary of dummy prompts
    * model: the HF ID of the specific BART variant under test
    * dtype: the tensor datatype to employ
    * max_tokens
    * num_logprobs
    * decoder_prompt_type: key into the example_encoder_decoder_prompts
      dictionary; selects specific encoder/decoder prompt scenarios to test

    A note on using HF BART as a baseline for validating vLLM BART,
    specifically when the decoder prompt is None.

    The HF GenerationMixin's default behavior is to force the first
    decoded token to be <BOS> if the prompt does not already contain
    <BOS> (this is accomplished using a logit processor setting.)

    So when we use HF BART as our baseline for comparison, note that
    when the user provides a request with a None decoder prompt
    (i.e. a singleton encoder prompt, or else an explicit encoder/
    decoder prompt with the decoder sub-prompt set to None), HF and
    vLLM handle this in different ways:

    * HF will (1) tokenize the None prompt as an empty token-list,
      (2) append <decoder-start-token> to the beginning, yielding
      [<decoder-start-token>], (3) pass this token list to the model, and
      then (4) after computing logits during prefill, override the model
      logits & force <BOS> to be the first generated token.

    * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append
      <decoder-start-token> to the beginning, yielding
      [<decoder-start-token><BOS>], (3) pass these tokens to the model &
      proceed with generation.

    The net effect is that compared to vLLM, the list of HF *decoded* tokens
    will contain one more initial <BOS> than the vLLM generated tokens,
    because vLLM's <BOS> token is injected into the prompt rather than into
    the generated output. This is in spite of the fact that overall, the
    complete sequences (prompt + decoded tokens) produced by vLLM will match
    HF.

    So when we use HF decoded token output to validate vLLM's decoded token
    output, the testing process must account for the difference in decoded
    token sequences between vLLM and HF specifically in the
    decoder-prompt-is-None case.

    One option is to disable the logit processor feature that forces the
    <BOS> token to be decoded (forced_bos_token_id = None), eliminating
    the problem entirely. However this is not "normal" BART usage.

    The other option is - only in the decoder-prompt-is-None case - to
    discard the first decoded token from the HF output before comparing it
    to vLLM.

    To that end, when testing the scenario where the decoder prompt is None
    (and only in that one scenario), this test skips the first HF decoded
    token during the process of validating the vLLM decoded output.
    '''
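    # Illustrative sketch (not part of the original test): with a None decoder
    # prompt, the two runners return decoded-token lists offset by one, e.g.
    # with hypothetical token ids
    #   HF:   [<BOS>, 1722, 318, ...]   -> first token is the forced <BOS>
    #   vLLM: [1722, 318, ...]          -> <BOS> already lives in the prompt
    # so the comparison below skips the first HF token in that scenario.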

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default).

    # Note: currently encoder/decoder models are only compatible with
    # enforce_eager=True. Normally this is not a problem because
    # for encoder/decoder models vLLM will
    # default to enforce_eager=True if enforce_eager
    # is left unspecified. However, the
    # VllmRunner test fixture (which wraps around the LLM class) defaults to
    # enforce_eager=False (a behavior which a number of already-existing
    # decoder-only unit tests expect), so when testing an encoder/decoder
    # model we must explicitly specify enforce_eager=True in the VllmRunner
    # constructor.
    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Configuration settings for HF baseline
    hf_kwargs = {
        "top_k": None,
        "num_beams": 1,
        "repetition_penalty": 1.0,
        "top_p": 1.0,
        "length_penalty": 1.0,
        "early_stopping": False,
        "no_repeat_ngram_size": None,
        "min_length": 0
    }

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            **hf_kwargs,
        ))

    hf_skip_tokens = (1
                      if decoder_prompt_type == DecoderPromptType.NONE else 0)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=[
            vllm_to_hf_output(vllm_output, decoder_prompt_type)
            for vllm_output in vllm_outputs
        ],
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )


@pytest.mark.parametrize(
    "model",
    [
        pytest.param("facebook/bart-base",
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
        pytest.param("facebook/bart-large-cnn"),
    ],
)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.skip(reason="bart not supported in V1")
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:

    run_test(
        hf_runner,
        vllm_runner,
        example_encoder_decoder_prompts[decoder_prompt_type],
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
@pytest.mark.skip(reason="bart not supported in V1")
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    run_test(
        hf_runner,
        vllm_runner,
        example_encoder_decoder_prompts[decoder_prompt_type],
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
@@ -1,123 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import pytest
from transformers import AutoModelForSeq2SeqLM

from vllm.sequence import SampleLogprobs

from ....conftest import DecoderPromptType, HfRunner, VllmRunner
from ...utils import check_logprobs_close


def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output
    hf_output_str = output_str + "</s>"
    return output_ids, hf_output_str, out_logprobs


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    prompts: list[dict[str, str]],
    decoder_prompt_type: DecoderPromptType,
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    '''
    Test the vLLM mBART model by validating it against HuggingFace (HF).
    (Docstring content is omitted for brevity)
    '''

    vllm_prompts = prompts
    if decoder_prompt_type == DecoderPromptType.NONE:
        vllm_prompts = [{
            "encoder_prompt": p['encoder_prompt'],
            "decoder_prompt": ""
        } for p in prompts]

    vllm_kwargs = {
        "hf_overrides": {
            "architectures": ["MBartForConditionalGeneration"]
        }
    }

    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True,
                     **vllm_kwargs) as vllm_model:  # type: ignore
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            vllm_prompts, max_tokens, num_logprobs)

    hf_kwargs = {
        "top_k": None,
        "num_beams": 1,
        "repetition_penalty": 1.0,
        "top_p": 1.0,
        "length_penalty": 1.0,
        "early_stopping": False,
        "no_repeat_ngram_size": None,
        "min_length": 0
    }

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_kwargs["decoder_start_token_id"] = (
            hf_model.tokenizer.lang_code_to_id["ro_RO"])

        hf_outputs = (
            hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                prompts,  # HF runner still uses the original prompts
                max_tokens,
                num_logprobs,
                **hf_kwargs,
            ))

    hf_skip_tokens = 0

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=[
            vllm_to_hf_output(vllm_output, decoder_prompt_type)
            for vllm_output in vllm_outputs
        ],
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )


@pytest.mark.parametrize(
    "model",
    [pytest.param("facebook/mbart-large-en-ro")],
)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:

    run_test(
        hf_runner,
        vllm_runner,
        example_encoder_decoder_prompts[decoder_prompt_type],
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
@@ -1,147 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import pytest
from PIL import Image

from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs

from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
from ...utils import check_logprobs_close

MODELS = ["microsoft/Florence-2-base"]
# Florence-2 model repo's tokenizer config is missing some special tokens.
# Therefore, we use a converted tokenizer from a forked repo
TOKENIZER = "Isotr0py/Florence-2-tokenizer"
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<OD>",  # special task token which will output special tokens
    "cherry_blossom":
    "Describe in detail what is shown in the image.",
})


def get_hf_images_prompts(
    prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
    prompts, images = [], []
    for prompt in prompts_:
        encoder_prompt = prompt["encoder_prompt"]
        prompts.append(
            ExplicitEncoderDecoderPrompt(
                encoder_prompt=encoder_prompt["prompt"],
                decoder_prompt=None,
            ))
        images.append(encoder_prompt["multi_modal_data"]["image"])
    return prompts, images


def hf_to_vllm_output(hf_output: tuple[list[int], str,
                                       Optional[SampleLogprobs]]):
    """Sanitize hf output to be comparable with vllm output."""
    output_ids, output_str, out_logprobs = hf_output

    output_str = output_str.replace("</s>", "").replace("<s>", "")

    return output_ids, output_str, out_logprobs


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[list[ExplicitEncoderDecoderPrompt]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    with vllm_runner(model,
                     max_num_seqs=8,
                     tokenizer_name=TOKENIZER,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_case = [
            vllm_model.generate_encoder_decoder_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                skip_special_tokens=False,
            ) for prompts in inputs
        ]

    hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]

    with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
        hf_model.model.get_output_embeddings = lambda: \
            hf_model.model.language_model.lm_head
        hf_outputs_per_case = [
            hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                prompts, max_tokens, num_logprobs=num_logprobs, images=images)
            for prompts, images in hf_inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
                                        vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs],
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
            num_outputs_0_skip_tokens=1,
        )


# FIXME: https://github.com/huggingface/transformers/issues/38358
@pytest.mark.skip("Model initialization fails")
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                image_assets: ImageTestAssets, model: str,
                size_factors: list[int], dtype: str, max_tokens: int,
                num_logprobs: int) -> None:
    images = [asset.pil_image for asset in image_assets]

    inputs_per_image = [[
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=TextPrompt(
                prompt=prompt,
                multi_modal_data={"image": rescale_image_size(image, factor)}),
            decoder_prompt=None,
        ) for factor in size_factors
    ] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
@@ -1,768 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional, overload

import pytest
import torch
from packaging.version import Version
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
from transformers import __version__ as TRANSFORMERS_VERSION

from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                     global_force_attn_backend_context_manager)
from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs

from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
                          PromptImageInput, VllmRunner)
from ....quantization.utils import is_quant_method_supported
from ....utils import (create_new_process_for_each_test, large_gpu_test,
                       multi_gpu_test)
from ...utils import check_logprobs_close

_LIMIT_IMAGE_PER_PROMPT = 3
MLLAMA_IMAGE_TOKEN_ID = 128256

LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|image|><|begin_of_text|>The meaning of the image is",
    "cherry_blossom":
    "<|image|><|begin_of_text|>The city is",
})

text_only_prompts = [
    "The color of the sky is blue but sometimes it can also be",
]

models = [
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
]

# Indices for inputs
TEXT_ONLY = '0'
IMAGE_AT_BEG = '1'
IMAGE_AT_MIDDLE = '2'
TWO_IMAGES = '3'

# Input tokenized
prompt_data = {
    # Tell me a story
    TEXT_ONLY: [41551, 757, 264, 3446],
    # <|image|> What's the content of this image
    IMAGE_AT_BEG:
    [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220],
    # Hello <|image|>What' the content of this image
    IMAGE_AT_MIDDLE:
    [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217],
    #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
    TWO_IMAGES: [
        MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
        MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
    ]
}


def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]

    hf_output_str = output_str
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs


def _get_inputs(
    image_assets: ImageTestAssets,
    *,
    size_factors: Optional[list[float]] = None,
    sizes: Optional[list[tuple[int, int]]] = None,
) -> list[tuple[list[str], PromptImageInput]]:
    images = [asset.pil_image for asset in image_assets]

    if size_factors is not None:
        inputs_per_image = [(
            [prompt for _ in size_factors],
            [rescale_image_size(image, factor) for factor in size_factors],
        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
    elif sizes is not None:
        inputs_per_image = [(
            [
                prompt if size is not None else text_only_prompts[0]
                for size in sizes
            ],
            [
                image.resize(size) if size is not None else None
                for size in sizes
            ],
        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
        if len(sizes) == 0:
            inputs_per_image.append(
                (text_only_prompts, [None] * len(text_only_prompts)))
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    return inputs_per_image


@overload
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
    model: str,
    *,
    size_factors: list[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


@overload
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
    model: str,
    *,
    sizes: list[tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
    model: str,
    *,
    size_factors: Optional[list[float]] = None,
    sizes: Optional[list[tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    _run_test(
        hf_runner,
        vllm_runner,
        _get_inputs(image_assets, size_factors=size_factors, sizes=sizes),
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )


def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
            model,
            dtype=dtype,
            max_model_len=19212,  # 3 max size images
            max_num_seqs=3,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            limit_mm_per_prompt={"image":
                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
            for prompts, images in inputs
        ]

    with hf_runner(model,
                   dtype=dtype,
                   model_kwargs={"device_map": "auto"},
                   auto_cls=AutoModelForImageTextToText) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
            for prompts, images in inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )


@pytest.fixture(autouse=True)
def clear_cache():
    """Fixture to clear backend cache before each test."""
    _cached_get_attn_backend.cache_clear()  # Clear the cache
    yield  # This allows the test to run


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [
        # Text only
        [],
        # Single-size
        [(512, 512)],
        # Single-size, batched
        [(512, 512), (512, 512), (512, 512)],
        # Multi-size, batched
        [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
         (1024, 1024), (512, 1536), (512, 2028)],
        # Multi-size, batched, including text only
        [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
         (1024, 1024), (512, 1536), (512, 2028), None],
        # mllama has 8 possible aspect ratios, carefully set the sizes
        # to cover all of them
    ])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
                                     model, sizes, dtype, max_tokens,
                                     num_logprobs,
                                     attn_backend: _Backend) -> None:
    with global_force_attn_backend_context_manager(attn_backend):
        if attn_backend == _Backend.FLASH_ATTN:
            # Flash Attention works only with bfloat16 data-type
            dtype = 'bfloat16'
        run_test(
            hf_runner,
            vllm_runner,
            image_assets,
            model,
            sizes=sizes,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
                                     model, dtype, max_tokens, num_logprobs,
                                     attn_backend: _Backend) -> None:

    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    inputs = [(
        [
            "<|image|><|image|><|begin_of_text|>Describe 2 images.",  # noqa: E501
            "<|image|><|image|><|begin_of_text|>Describe 2 images.",  # noqa: E501
            "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.",  # noqa: E501
        ],
        [
            [stop_sign, cherry_blossom],
            # Images with different sizes.
            [
                stop_sign.resize((512, 512)),
                stop_sign,
            ],
            [
                stop_sign,
                stop_sign.resize((512, 1536)),
                cherry_blossom.resize((512, 1024)),
            ],
        ])]
    with global_force_attn_backend_context_manager(attn_backend):
        if attn_backend == _Backend.FLASH_ATTN:
            # Flash Attention works only with bfloat16 data-type
            dtype = 'bfloat16'
        _run_test(
            hf_runner,
            vllm_runner,
            inputs,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
                                   dtype, max_tokens, num_logprobs,
                                   attn_backend: _Backend) -> None:

    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    inputs = [(
        [
            "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
            "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, "  # noqa: E501
            "which is a stop sign and which is a cherry blossom?",  # noqa: E501
        ],
        [
            [stop_sign],
            [stop_sign, cherry_blossom],
        ])]
    with global_force_attn_backend_context_manager(attn_backend):
        if attn_backend == _Backend.FLASH_ATTN:
            # Flash Attention works only with bfloat16 data-type
            dtype = 'bfloat16'
        _run_test(
            hf_runner,
            vllm_runner,
            inputs,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )


@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_distributed(
    hf_runner,
    vllm_runner,
    image_assets,
    distributed_executor_backend,
    model,
    dtype,
    max_tokens,
    num_logprobs,
) -> None:
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model=model,
        size_factors=[0.25, 0.5, 1.0],
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
def test_bnb_regression(
    image_assets: ImageTestAssets,
    model: str,
    dtype: str,
    max_tokens: int,
):
    stop_sign = image_assets[0].pil_image
    prompts = [
        {
            "prompt": "<|begin_of_text|>The content of the image <|image|> is",
            "multi_modal_data": {
                "image": stop_sign
            },
        },
        {
            "prompt":
            "The color of the sky is blue but sometimes it can also be",
        },
    ]
    # Test regression about QKVCrossParallelLinear
    llm = LLM(
        model=model,
        dtype=dtype,
        max_model_len=8192,
        max_num_seqs=2,
        quantization="bitsandbytes",
    )
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=max_tokens,
    )
    outputs = llm.generate(prompts, sampling_params)
    assert outputs


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
def test_explicit_implicit_prompt(
    image_assets: ImageTestAssets,
    model: str,
    dtype: str,
    max_tokens: int,
):
    stop_sign = image_assets[0].pil_image
    # yapf: disable
    prompts = [
        # explicit prompt
        {
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {"image": stop_sign},
            },
            "decoder_prompt": {
                "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374],  # noqa: E501
            }
        },
        {
            "encoder_prompt": "Not <|image|>",
            "decoder_prompt": "The color of the sky is blue but sometimes it can also be",  # noqa: E501
        },
        # implicit prompt
        {
            "prompt": "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
            "multi_modal_data": {"image": stop_sign},
        },
        {
            "prompt": "The color of the sky is blue but sometimes it can also be",  # noqa: E501
        },
    ]
    # yapf: enable
    llm = LLM(
        model=model,
        dtype=dtype,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=1,
    )
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=max_tokens,
    )
    outputs = llm.generate(prompts, sampling_params)
    n_prompts = len(prompts)
    explicit_outputs = outputs[:n_prompts // 2]
    implicit_outputs = outputs[n_prompts // 2:]
    for exp_output, imp_output in zip(explicit_outputs, implicit_outputs):
        assert exp_output.outputs[0].text == imp_output.outputs[0].text


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                    num_logprobs, attn_backend: _Backend) -> None:

    stop_sign = image_assets[0].pil_image

    with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
            model,
            dtype=dtype,
            max_model_len=8192,
            max_num_seqs=4,
            tensor_parallel_size=1,
            limit_mm_per_prompt={"image":
                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:

        # Regression tests for https://github.com/vllm-project/vllm/issues/10648

        # Number of groups of image tokens is greater than the number of images
        # provided (the whitespace between the tags is necessary)
        prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images"  # noqa: E501
        image = stop_sign
        with pytest.raises(ValueError):
            vllm_model.generate_greedy_logprobs([prompt],
                                                max_tokens,
                                                num_logprobs,
                                                images=[image])

        # Batch of a text-only and image request that requires cross-attention
        prompts = [
            "What is the capital of spain?",
            "Text before the image...<|image|>What is in the image?",  # noqa: E501
        ]
        images = [
            None,
            [stop_sign],
        ]
        vllm_model.generate_greedy_logprobs(prompts,
                                            max_tokens,
                                            num_logprobs,
                                            images=images)

        # Test the reverse order too for good measure
        prompts = [
            "<|begin_of_text|>Text before the image...<|image|>What is in the image?",  # noqa: E501
            "<|begin_of_text|>Hello!",
        ]
        images = [
            [stop_sign],
            None,
        ]
        vllm_model.generate_greedy_logprobs(prompts,
                                            max_tokens,
                                            num_logprobs,
                                            images=images)

        # Mixed batch with text and images with different numbers of tiles
        prompts = [
            "<|begin_of_text|>Hello!",
            "<|begin_of_text|>Some text before.<|image|>What is in the image?",  # noqa: E501
            "<|begin_of_text|>Some text before.<|image|>What is in the image?",  # noqa: E501
        ]
        images = [
            None,
            [stop_sign],
            # smaller image must be 2nd for the repro
            [stop_sign.resize((448, 448))],
        ]
        vllm_model.generate_greedy_logprobs(prompts,
                                            max_tokens,
                                            num_logprobs,
                                            images=images)


class DummyModel:
    image_token_id = MLLAMA_IMAGE_TOKEN_ID


@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices_and_output",
    # inputs, (cross_attention_mask, kv_range_for_decode)
    [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
     ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
     ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
      ((23, 24), [[0, 6], [6, 12]])),
     ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
     ([TWO_IMAGES], ((18, 12), [[6, 12]])),
     ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
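# Illustrative note (not in the original test): with num_tokens_per_tile=3 and
# num_tiles=[2, 2] per image, each image contributes 4 * 3 = 12 KV positions,
# so e.g. [TEXT_ONLY, IMAGE_AT_MIDDLE] (4 + 10 query tokens) expects a mask of
# shape (14, 12).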
def test_get_cross_attention_mask(input_indices_and_output) -> None:

    input_indices, expected_output = input_indices_and_output

    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
    num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices
                 if i != TEXT_ONLY]
    input = torch.cat(sequences)

    seq_lens = [len(s) for s in sequences]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_prefill_tokens=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    dummy = DummyModel()

    cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
        .get_cross_attention_mask(dummy,
                                  input,
                                  attn_data,
                                  num_tiles=num_tiles,
                                  num_tokens_per_tile=3,
                                  dtype=torch.bfloat16)

    expected_cross_attention_mask, expected_kv_range_for_decode = \
        expected_output

    assert kv_range_for_decode == expected_kv_range_for_decode
    if expected_cross_attention_mask is not None:
        assert cross_attention_mask is not None
        assert cross_attention_mask.shape == expected_cross_attention_mask
    else:
        assert cross_attention_mask is None


@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices",
    [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
     [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
     [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
def test_get_full_text_row_masked_out_mask(input_indices) -> None:

    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]

    seq_lens = [len(s) for s in sequences]

    num_prefill_tokens = sum(seq_lens)

    # TEXT_ONLY is zero, so it will be masked out,
    # other instances should not be.
    encoder_seq_lens = [int(i) for i in input_indices]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        encoder_seq_lens=encoder_seq_lens,
        num_prefill_tokens=num_prefill_tokens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    dummy = DummyModel()

    full_text_row_masked_out_mask = MllamaForConditionalGeneration\
        .get_full_text_row_masked_out_mask(dummy,
                                           attn_data,
                                           torch.get_default_device())

    full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze()
    full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist()

    idx = 0
    assert len(full_text_row_masked_out_mask) == num_prefill_tokens
    for i, seq_len in enumerate(seq_lens):
        must_be_masked = input_indices[i] != TEXT_ONLY
        for _ in range(seq_len):
            assert full_text_row_masked_out_mask[idx] == must_be_masked, \
                f"full_text_row_masked_out_mask[{idx}] must be " \
                f"'{must_be_masked}' "
            idx += 1


@pytest.mark.core_model
@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [
    ([6404], [[4]], [6404]),
    ([0, 6404], [[4]], [6404]),
    ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]),
    ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]),
])
def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles,
                                         expected) -> None:

    dummy = DummyModel()
    num_tokens_per_tile = 1601
    actual_encoder_seq_lens = MllamaForConditionalGeneration \
        ._get_and_validate_encoder_lens(
            dummy,
            encoder_seq_lens,
            num_tiles,
            num_tokens_per_tile,
        )
    assert actual_encoder_seq_lens == expected, \
        f"Expected {expected} but got {actual_encoder_seq_lens}"
@@ -167,8 +167,6 @@ def _test_processing_correctness(
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
    "donut": False,
    "mllama": False,
    "ovis": False,
    "ovis2_5": False,
    "paligemma": False,
@@ -278,9 +276,7 @@ def _test_processing_correctness_one(
    "facebook/chameleon-7b",
    "CohereLabs/command-a-vision-07-2025",
    "deepseek-ai/deepseek-vl2-tiny",
    "naver-clova-ix/donut-base-finetuned-docvqa",
    "baidu/ERNIE-4.5-VL-28B-A3B-PT",
    "microsoft/Florence-2-base",
    "adept/fuyu-8b",
    "google/gemma-3-4b-it",
    "google/gemma-3n-E2B-it",
@@ -305,7 +301,6 @@ def _test_processing_correctness_one(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
    "mispeech/midashenglm-7b",
    "openbmb/MiniCPM-Llama3-V-2_5",
@@ -1,72 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from transformers import MllamaConfig

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.profiling import MultiModalProfiler

from ...utils import build_model_context


@pytest.mark.parametrize("model_id",
                         ["meta-llama/Llama-3.2-11B-Vision-Instruct"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
def test_profiling(
    model_id: str,
    max_model_len: int,
    max_num_seqs: int,
):
    # regression test for https://github.com/vllm-project/vllm/issues/13929
    from vllm.model_executor.models.mllama import calc_token_per_chunk

    model_config_kwargs = {
        "max_model_len": max_model_len,
    }
    ctx = build_model_context(
        model_id,
        model_config_kwargs=model_config_kwargs,
        limit_mm_per_prompt={"image": 1},
    )

    mm_config = ctx.get_mm_config()
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    profiler = MultiModalProfiler(processor)

    dummy_encoder_data = profiler.get_encoder_dummy_data(
        max_model_len,
        mm_counts=mm_config.limit_per_prompt,
    )
    dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
        max_model_len,
        mm_counts=mm_config.limit_per_prompt,
    )

    hf_config = ctx.get_hf_config(MllamaConfig)
    image_size = hf_config.vision_config.image_size
    encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
                        ] * max_num_seqs

    mm_data = processor.apply(
        prompt=dummy_mm_data.prompt,
        mm_data=dummy_mm_data.mm_data,
        hf_processor_mm_kwargs=dict(),
    )["mm_kwargs"].get_data()

    # Get the actual number of encoder tokens for each sample.
    # Because attn_metadata.encoder_seq_lens only counts the last
    # group of images for each sample, which is used to cheat the
    # block manager to allocate blocks for those images only.
    # See MllamaMultiModalProcessor for more details.
    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
    num_tokens_per_tile = calc_token_per_chunk(image_size)
    actual_encoder_seq_lens = [
        sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
    ]

    # simulate mllama image-present prefill.
    for actual_len, last_group_len in zip(actual_encoder_seq_lens,
                                          encoder_seq_lens):
        assert actual_len >= last_group_len
@@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides

ARCH_TO_SKIP = {
    "MolmoForCausalLM": "incompatible requirements",
    "Florence2ForConditionalGeneration": "not supported in V1",
}
ARCH_NEEDS_EXTRAS = [
    "InternVLChatModel",
@@ -354,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                       trust_remote_code=True),
    "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
    # [Encoder-decoder]
    "BartModel": _HfExamplesInfo("facebook/bart-base"),
    "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
    "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro",  # noqa: E501
                                                     hf_overrides={"architectures": ["MBartForConditionalGeneration"]}),  # noqa: E501
}

_EMBEDDING_EXAMPLE_MODELS = {
@@ -496,7 +491,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                      trust_remote_code=True),
    "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
                                                      max_model_len=10240,
                                                      extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"},  # noqa: E501
                                                      extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"},  # noqa: E501
                                                      ),
    "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                     extras={"mistral": "mistral-community/pixtral-12b",  # noqa: E501
@@ -583,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                    is_available_online=False,
                                    ),
    # [Encoder-decoder]
    "DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa",  # noqa: E501
                                                     hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"},  # noqa: E501
                                                     extras={"dolphin": "ByteDance/Dolphin"}),  # noqa: E501
    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
    # Therefore, we borrow the BartTokenizer from the original Bart model
    "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
                                                         tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
                                                         trust_remote_code=True),  # noqa: E501
    "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
    # [Cross-encoder]
    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),  # noqa: E501
@@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
        # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
        # L4 supports FA3.
        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
    if model_arch == "Florence2ForConditionalGeneration":
        # An encoder-decoder model that's V0-only. Just skip it
        # since V0 is about to be removed.
        pytest.skip("Skipping Florence2ForConditionalGeneration")
    if model_arch == "WhisperForConditionalGeneration":
        m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    LLM(
@@ -50,7 +50,6 @@ def test_registry_imports(model_arch):
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
    ("LlamaForCausalLM", False, False, False),
    ("MllamaForConditionalGeneration", True, False, False),
    ("LlavaForConditionalGeneration", True, True, False),
    ("BertForSequenceClassification", False, False, True),
    ("RobertaForSequenceClassification", False, False, True),