Remove V0 Encoder-Decoder Support (#24907)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Woosuk Kwon
2025-09-15 21:17:14 -07:00
committed by GitHub
parent 5206ab20ba
commit 759ef49b15
47 changed files with 13 additions and 9661 deletions

View File

@@ -1,311 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import os
from dataclasses import dataclass
import cv2
import numpy as np
import regex as re
from PIL import Image
from transformers import DonutProcessor
from vllm import LLM, SamplingParams
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
from vllm.multimodal.utils import fetch_image
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
@dataclass
class ImageDimensions:
    original_w: int
    original_h: int
    padded_w: int
    padded_h: int

# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
def map_to_original_coordinates(
    x1, y1, x2, y2, dims: ImageDimensions
) -> tuple[int, int, int, int]:
    try:
        top = (dims.padded_h - dims.original_h) // 2
        left = (dims.padded_w - dims.original_w) // 2
        orig_x1 = max(0, x1 - left)
        orig_y1 = max(0, y1 - top)
        orig_x2 = min(dims.original_w, x2 - left)
        orig_y2 = min(dims.original_h, y2 - top)
        if orig_x2 <= orig_x1:
            orig_x2 = min(orig_x1 + 1, dims.original_w)
        if orig_y2 <= orig_y1:
            orig_y2 = min(orig_y1 + 1, dims.original_h)
        return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
    except Exception as e:
        print(f"map_to_original_coordinates error: {str(e)}")
        return 0, 0, min(100, dims.original_w), min(100, dims.original_h)

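# For illustration (hypothetical values, not part of the original example):
# with ImageDimensions(original_w=80, original_h=60, padded_w=100, padded_h=100),
# the padding offsets are left=10 and top=20, so a padded-space box
# (15, 25, 95, 85) maps back to (5, 5, 80, 60) in original-image coordinates.
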
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
def adjust_box_edges(image, boxes: list[list[float]], max_pixels=15, threshold=0.2):
    if isinstance(image, str):
        image = cv2.imread(image)
    img_h, img_w = image.shape[:2]
    new_boxes = []
    for box in boxes:
        best_box = copy.deepcopy(box)

        def check_edge(img, current_box, i, is_vertical):
            edge = current_box[i]
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
            )
            if is_vertical:
                line = binary[current_box[1] : current_box[3] + 1, edge]
            else:
                line = binary[edge, current_box[0] : current_box[2] + 1]
            transitions = np.abs(np.diff(line))
            return np.sum(transitions) / len(transitions)

        edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)]
        current_box = copy.deepcopy(box)
        current_box[0] = min(max(current_box[0], 0), img_w - 1)
        current_box[1] = min(max(current_box[1], 0), img_h - 1)
        current_box[2] = min(max(current_box[2], 0), img_w - 1)
        current_box[3] = min(max(current_box[3], 0), img_h - 1)
        for i, direction, is_vertical in edges:
            best_score = check_edge(image, current_box, i, is_vertical)
            if best_score <= threshold:
                continue
            for step in range(max_pixels):
                current_box[i] += direction
                if i == 0 or i == 2:
                    current_box[i] = min(max(current_box[i], 0), img_w - 1)
                else:
                    current_box[i] = min(max(current_box[i], 0), img_h - 1)
                score = check_edge(image, current_box, i, is_vertical)
                if score < best_score:
                    best_score = score
                    best_box = copy.deepcopy(current_box)
                if score <= threshold:
                    break
        new_boxes.append(best_box)
    return new_boxes

# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
    try:
        x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
        x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
        x1, y1, x2, y2 = (
            max(0, min(x1, dims.padded_w - 1)),
            max(0, min(y1, dims.padded_h - 1)),
            max(0, min(x2, dims.padded_w)),
            max(0, min(y2, dims.padded_h)),
        )
        if x2 <= x1:
            x2 = min(x1 + 1, dims.padded_w)
        if y2 <= y1:
            y2 = min(y1 + 1, dims.padded_h)
        new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
        x1, y1, x2, y2 = new_boxes[0]
        x1, y1, x2, y2 = (
            max(0, min(x1, dims.padded_w - 1)),
            max(0, min(y1, dims.padded_h - 1)),
            max(0, min(x2, dims.padded_w)),
            max(0, min(y2, dims.padded_h)),
        )
        if x2 <= x1:
            x2 = min(x1 + 1, dims.padded_w)
        if y2 <= y1:
            y2 = min(y1 + 1, dims.padded_h)
        if previous_box is not None:
            prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
            if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1):
                y1 = prev_y2
                y1 = min(y1, dims.padded_h - 1)
                if y2 <= y1:
                    y2 = min(y1 + 1, dims.padded_h)
        new_previous_box = [x1, y1, x2, y2]
        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
            x1, y1, x2, y2, dims
        )
        return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
    except Exception as e:
        print(f"process_coordinates error: {str(e)}")
        orig_x1, orig_y1, orig_x2, orig_y2 = (
            0,
            0,
            min(100, dims.original_w),
            min(100, dims.original_h),
        )
        return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]

# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
def prepare_image(image) -> tuple[np.ndarray, ImageDimensions]:
    try:
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        original_h, original_w = image_cv.shape[:2]
        max_size = max(original_h, original_w)
        top = (max_size - original_h) // 2
        bottom = max_size - original_h - top
        left = (max_size - original_w) // 2
        right = max_size - original_w - left
        padded_image = cv2.copyMakeBorder(
            image_cv, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0)
        )
        padded_h, padded_w = padded_image.shape[:2]
        dimensions = ImageDimensions(
            original_w=original_w,
            original_h=original_h,
            padded_w=padded_w,
            padded_h=padded_h,
        )
        return padded_image, dimensions
    except Exception as e:
        print(f"prepare_image error: {str(e)}")
        h, w = image.height, image.width
        dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
        return np.zeros((h, w, 3), dtype=np.uint8), dimensions

# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
def parse_layout_string(bbox_str):
    """Parse layout string using regular expressions"""
    pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)"
    matches = re.finditer(pattern, bbox_str)
    parsed_results = []
    for match in matches:
        coords = [float(match.group(i)) for i in range(1, 5)]
        label = match.group(5).strip()
        parsed_results.append((coords, label))
    return parsed_results

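# For illustration (hypothetical layout string, not actual model output):
# parse_layout_string("[0.1, 0.2, 0.8, 0.4] text [0.1, 0.5, 0.8, 0.9] tab")
# would return [([0.1, 0.2, 0.8, 0.4], "text"), ([0.1, 0.5, 0.8, 0.9], "tab")].
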
model_id = "ByteDance/Dolphin"
# The input image size for Dolphin is 896 x 896,
# and the patch_size is 4 x 4.
# Therefore, the initial number of patches is:
# Height: 896 / 4 = 224 patches
# Width: 896 / 4 = 224 patches
# The Dolphin model uses a staged downsampling approach,
# defined by the "depths": [2, 2, 14, 2] configuration.
# Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
# which halves the feature map's dimensions (dividing both height and width by 2).
# Before Stage 2: The size changes from 224 x 224 to (224/2) x (224/2) = 112 x 112.
# Before Stage 3: The size changes from 112 x 112 to (112/2) x (112/2) = 56 x 56.
# Before Stage 4: The size changes from 56 x 56 to (56/2) x (56/2) = 28 x 28.
# Because vLLM needs to fill the image features with an encoder_prompt,
# and the encoder_prompt will have `<pad>` tokens added when tokenized,
# we need to construct an encoder_prompt with a length of 28 x 28 - 1 = 783.
encoder_prompt = "".join(["0"] * 783)
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=2048,
)
processor = DonutProcessor.from_pretrained(model_id)
llm = LLM(
    model=model_id,
    dtype="float16",
    max_num_seqs=8,
    hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--image_path", type=str, default=None, help="Path to a local image file."
)
args = parser.parse_args()

if args.image_path:
    if not os.path.exists(args.image_path):
        raise FileNotFoundError(f"Error: File not found at {args.image_path}")
    image = Image.open(args.image_path).convert("RGB")
else:
    image = fetch_image(
        "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
    )

prompt = "Parse the reading order of this document. "
decoder_prompt = f"<s>{prompt}<Answer/>"
decoder_prompt_tokens = TokensPrompt(
    prompt_token_ids=processor.tokenizer(decoder_prompt, add_special_tokens=False)[
        "input_ids"
    ]
)
enc_dec_prompt = ExplicitEncoderDecoderPrompt(
    encoder_prompt=TextPrompt(prompt=encoder_prompt, multi_modal_data={"image": image}),
    decoder_prompt=decoder_prompt_tokens,
)
layout_outputs = llm.generate(prompts=enc_dec_prompt, sampling_params=sampling_params)
layout_result_str = layout_outputs[0].outputs[0].text
print(f"Layout analysis output:\n{layout_result_str}")
padded_image, dims = prepare_image(image)
layout_results = parse_layout_string(layout_result_str)
text_table_elements = []
previous_box = None
reading_order = 0
for bbox_coords, label in layout_results:
    if label == "fig":
        continue
    try:
        x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = (
            process_coordinates(bbox_coords, padded_image, dims, previous_box)
        )
        cropped = padded_image[y1:y2, x1:x2]
        if cropped.size > 0 and cropped.shape[0] > 3 and cropped.shape[1] > 3:
            pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
            prompt_ocr = (
                "Parse the table in the image. "
                if label == "tab"
                else "Read text in the image. "
            )
            text_table_elements.append(
                {
                    "crop": pil_crop,
                    "prompt": prompt_ocr,
                    "reading_order": reading_order,
                }
            )
            reading_order += 1
    except Exception as e:
        print(f"Error processing bbox (label: {label}): {str(e)}")
        continue

if text_table_elements:
    batch_prompts = []
    for elem in text_table_elements:
        decoder_prompt_str = f"<s>{elem['prompt']}<Answer/>"
        decoder_prompt_tokens = TokensPrompt(
            prompt_token_ids=processor.tokenizer(
                decoder_prompt_str, add_special_tokens=False
            )["input_ids"]
        )
        enc_dec_prompt = ExplicitEncoderDecoderPrompt(
            encoder_prompt=TextPrompt(
                prompt=encoder_prompt, multi_modal_data={"image": elem["crop"]}
            ),
            decoder_prompt=decoder_prompt_tokens,
        )
        batch_prompts.append(enc_dec_prompt)
    batch_outputs = llm.generate(prompts=batch_prompts, sampling_params=sampling_params)
    for i, output in enumerate(batch_outputs):
        text_table_elements[i]["text"] = output.outputs[0].text.strip()

print("------" * 8)
text_table_elements.sort(key=lambda x: x["reading_order"])
for elem in text_table_elements:
    print(elem.get("text", ""))

View File

@@ -1,195 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART and mBART.
This script is refactored to allow model selection via command-line arguments.
NOTE: This example is not yet supported in V1.
"""
import argparse
from typing import NamedTuple, Optional
from vllm import LLM, SamplingParams
from vllm.inputs import (
    ExplicitEncoderDecoderPrompt,
    TextPrompt,
    TokensPrompt,
    zip_enc_dec_prompts,
)

class ModelRequestData(NamedTuple):
    """
    Holds the configuration for a specific model, including its
    HuggingFace ID and the prompts to use for the demo.
    """

    model_id: str
    encoder_prompts: list
    decoder_prompts: list
    hf_overrides: Optional[dict] = None


def get_bart_config() -> ModelRequestData:
    """
    Returns the configuration for facebook/bart-large-cnn.
    This uses the exact test cases from the original script.
    """
    encoder_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "An encoder prompt",
    ]
    decoder_prompts = [
        "A decoder prompt",
        "Another decoder prompt",
    ]
    return ModelRequestData(
        model_id="facebook/bart-large-cnn",
        encoder_prompts=encoder_prompts,
        decoder_prompts=decoder_prompts,
    )

def get_mbart_config() -> ModelRequestData:
    """
    Returns the configuration for facebook/mbart-large-en-ro.
    This uses prompts suitable for an English-to-Romanian translation task.
    """
    encoder_prompts = [
        "The quick brown fox jumps over the lazy dog.",
        "How are you today?",
    ]
    decoder_prompts = ["", ""]
    hf_overrides = {"architectures": ["MBartForConditionalGeneration"]}
    return ModelRequestData(
        model_id="facebook/mbart-large-en-ro",
        encoder_prompts=encoder_prompts,
        decoder_prompts=decoder_prompts,
        hf_overrides=hf_overrides,
    )


MODEL_GETTERS = {
    "bart": get_bart_config,
    "mbart": get_mbart_config,
}

def create_all_prompt_types(
    encoder_prompts_raw: list,
    decoder_prompts_raw: list,
    tokenizer,
) -> list:
    """
    Generates a list of diverse prompt types for demonstration.
    This function is generic and uses the provided raw prompts
    to create various vLLM input objects.
    """
    text_prompt_raw = encoder_prompts_raw[0]
    text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)])
    tokens_prompt = TokensPrompt(
        prompt_token_ids=tokenizer.encode(
            encoder_prompts_raw[2 % len(encoder_prompts_raw)]
        )
    )
    decoder_tokens_prompt = TokensPrompt(
        prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0])
    )
    single_prompt_examples = [
        text_prompt_raw,
        text_prompt,
        tokens_prompt,
    ]
    explicit_pair_examples = [
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=text_prompt_raw,
            decoder_prompt=decoder_tokens_prompt,
        ),
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=text_prompt,
            decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)],
        ),
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=tokens_prompt,
            decoder_prompt=text_prompt,
        ),
    ]
    zipped_prompt_list = zip_enc_dec_prompts(
        encoder_prompts_raw,
        decoder_prompts_raw,
    )
    return single_prompt_examples + explicit_pair_examples + zipped_prompt_list

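# Note (added for clarity; see vllm.inputs for the authoritative behavior):
# zip_enc_dec_prompts pairs each encoder prompt with the decoder prompt at the
# same index, producing one ExplicitEncoderDecoderPrompt per pair.
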
def create_sampling_params() -> SamplingParams:
    """Create a sampling params object."""
    return SamplingParams(
        temperature=0,
        top_p=1.0,
        min_tokens=0,
        max_tokens=30,
    )


def print_outputs(outputs: list):
    """Formats and prints the generation outputs."""
    print("-" * 80)
    for i, output in enumerate(outputs):
        prompt = output.prompt
        encoder_prompt = output.encoder_prompt
        generated_text = output.outputs[0].text
        print(f"Output {i + 1}:")
        print(f"Encoder Prompt: {encoder_prompt!r}")
        print(f"Decoder Prompt: {prompt!r}")
        print(f"Generated Text: {generated_text!r}")
        print("-" * 80)

def main(args):
    """Main execution function."""
    model_key = args.model
    if model_key not in MODEL_GETTERS:
        raise ValueError(
            f"Unknown model: {model_key}. "
            f"Available models: {list(MODEL_GETTERS.keys())}"
        )
    config_getter = MODEL_GETTERS[model_key]
    model_config = config_getter()

    print(f"🚀 Running demo for model: {model_config.model_id}")
    llm = LLM(
        model=model_config.model_id,
        dtype="float",
        hf_overrides=model_config.hf_overrides,
    )
    tokenizer = llm.llm_engine.get_tokenizer_group()
    prompts = create_all_prompt_types(
        encoder_prompts_raw=model_config.encoder_prompts,
        decoder_prompts_raw=model_config.decoder_prompts,
        tokenizer=tokenizer,
    )
    sampling_params = create_sampling_params()
    outputs = llm.generate(prompts, sampling_params)
    print_outputs(outputs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="A flexible demo for vLLM encoder-decoder models."
    )
    parser.add_argument(
        "--model",
        "-m",
        type=str,
        default="bart",
        choices=MODEL_GETTERS.keys(),
        help="The short name of the model to run.",
    )
    args = parser.parse_args()
    main(args)

View File

@@ -13,8 +13,6 @@ from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
@@ -23,113 +21,6 @@ class ModelRequestData(NamedTuple):
    prompts: Sequence[PromptType]
def run_donut():
    engine_args = EngineArgs(
        model="naver-clova-ix/donut-base-finetuned-docvqa",
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="float16",
        hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
    )

    # The input image size for donut-base-finetuned-docvqa is 2560 x 1920,
    # and the patch_size is 4 x 4.
    # Therefore, the initial number of patches is:
    # Height: 1920 / 4 = 480 patches
    # Width: 2560 / 4 = 640 patches
    # The Swin model uses a staged downsampling approach,
    # defined by the "depths": [2, 2, 14, 2] configuration.
    # Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
    # which halves the feature map's dimensions (dividing both height and width by 2).
    # Before Stage 2: The size changes from 480 x 640 to (480/2) x (640/2) = 240 x 320.
    # Before Stage 3: The size changes from 240 x 320 to (240/2) x (320/2) = 120 x 160.
    # Before Stage 4: The size changes from 120 x 160 to (120/2) x (160/2) = 60 x 80.
    # Because vLLM needs to fill the image features with an encoder_prompt,
    # and the encoder_prompt will have `<pad>` tokens added when tokenized,
    # we need to construct an encoder_prompt with a length of 60 x 80 - 1 = 4799.
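    # Sanity check of the arithmetic above (added for illustration, not part of
    # the original example): (1920 // 4 // 8) * (2560 // 4 // 8) - 1 == 60 * 80 - 1 == 4799.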
    prompts = [
        {
            "encoder_prompt": {
                "prompt": "".join(["$"] * 4799),
                "multi_modal_data": {
                    "image": fetch_image(
                        "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
                    )  # noqa: E501
                },
            },
            "decoder_prompt": "<s_docvqa><s_question>What time is the coffee break?</s_question><s_answer>",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
        },
        {  # explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
                "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
            },
            "decoder_prompt": "",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

def run_mllama():
    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # Implicit prompt
            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        {  # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

def run_whisper():
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -166,9 +57,6 @@ def run_whisper():
model_example_map = {
    "donut": run_donut,
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,
}
@@ -182,7 +70,7 @@ def parse_args():
        "--model-type",
        "-m",
        type=str,
        default="mllama",
        default="whisper",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )

View File

@@ -204,28 +204,6 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    )
# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1008,44 +986,6 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    )
# Llama 3.2
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.
    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [
        [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": question}],
            }
        ]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )

# Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1665,7 +1605,6 @@ model_example_map = {
    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
    "ernie45_vl": run_ernie45_vl,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "gemma3n": run_gemma3n,
@@ -1691,7 +1630,6 @@ model_example_map = {
    "minicpmv": run_minicpmv,
    "minimax_vl_01": run_minimax_vl_01,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "molmo": run_molmo,
    "nemotron_vl": run_nemotron_vl,
    "NVLM_D": run_nvlm_d,

View File

@@ -637,26 +637,6 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    )
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    img_prompt = "Given the first image <|image|> and the second image<|image|>"
    prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )

def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "nvidia/NVLM-D-72B"
@@ -1253,7 +1233,6 @@ model_example_map = {
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,