[VLM] Separate text-only and vision variants of the same model architecture (#13157)

This commit is contained in:
Cyrus Leung
2025-02-13 22:19:15 +08:00
committed by GitHub
parent 02ed8a1fbe
commit 1bc3b5e71b
15 changed files with 1728 additions and 1642 deletions

View File

@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str):
max_num_seqs=2,
trust_remote_code=True,
enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>"
@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)

View File

@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
)
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-2b"
llm = LLM(
@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str,
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "".join(f"Picture {i}: <img></img>\n"
@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
"h2ovl_chat": load_h2onvl,
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"mllama": load_mllama,