[V1] Remove V0 code paths for Hybrid models (#25400)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Author:    Thomas Parnell
Date:      2025-09-23 17:26:13 +02:00
Committer: GitHub
Parent:    2c58742dff
Commit:    a903669e10

31 changed files with 352 additions and 2296 deletions

View File: tests/models/language/generation/test_hybrid.py

@@ -20,7 +20,9 @@ pytestmark = pytest.mark.hybrid_model
 SSM_MODELS = [
     "state-spaces/mamba-130m-hf",
     "tiiuae/falcon-mamba-tiny-dev",
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
+    # mamba2-codestral in transformers is broken pending:
+    # https://github.com/huggingface/transformers/pull/40861
+    #"yujiepan/mamba2-codestral-v0.1-tiny-random",
 ]
 
 HYBRID_MODELS = [
@@ -31,18 +33,7 @@ HYBRID_MODELS = [
     "ibm-granite/granite-4.0-tiny-preview",
     "tiiuae/Falcon-H1-0.5B-Base",
     "LiquidAI/LFM2-1.2B",
-]
-V1_SUPPORTED_MODELS = [
-    "state-spaces/mamba-130m-hf",
-    "ai21labs/Jamba-tiny-dev",
-    "pfnet/plamo-2-1b",
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    "Zyphra/Zamba2-1.2B-instruct",
-    "hmellor/tiny-random-BambaForCausalLM",
-    "ibm-granite/granite-4.0-tiny-preview",
-    "tiiuae/Falcon-H1-0.5B-Base",
-    "LiquidAI/LFM2-1.2B",
     "tiny-random/qwen3-next-moe",
 ]
 
 FULL_CUDA_GRAPH_MODELS = [
@@ -51,10 +42,6 @@ FULL_CUDA_GRAPH_MODELS = [
     "Zyphra/Zamba2-1.2B-instruct",
 ]
-
-V0_UNSUPPORTED_MODELS = [
-    "LiquidAI/LFM2-1.2B",
-]
 
 FP32_STATE_MODELS = [
     "state-spaces/mamba-130m-hf",
     "Zyphra/Zamba2-1.2B-instruct",
@@ -88,20 +75,16 @@ def test_models(
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
 
-    if model in V1_SUPPORTED_MODELS:
-        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-    else:
-        vllm_v1_outputs = None
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
 
-    if model in V1_SUPPORTED_MODELS:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v1_outputs,
-            name_0="hf",
-            name_1="vllm-v1",
-        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -299,14 +282,14 @@ def test_full_cuda_graph(
             example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_v1_outputs,
+        outputs_1_lst=vllm_outputs,
         name_0="hf",
-        name_1="vllm-v1",
+        name_1="vllm",
     )
@@ -340,12 +323,12 @@ def test_fp32_cache_state(
     with vllm_runner(model,
                      max_num_seqs=MAX_NUM_SEQS,
                      **{cache_dtype_param: "float32"}) as vllm_model:
-        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_v1_outputs,
+        outputs_1_lst=vllm_outputs,
         name_0="hf",
-        name_1="vllm-v1",
+        name_1="vllm",
     )
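
Applied together, the hunks above leave each of these tests with a single, unconditional code path. For clarity, a minimal sketch of the resulting flow in test_models, assuming the hf_runner/vllm_runner/example_prompts pytest fixtures and the check_logprobs_close helper from vLLM's test suite; the parametrize values shown are illustrative, not necessarily the file's actual ones:

import pytest

# MAX_NUM_SEQS, SSM_MODELS, HYBRID_MODELS, and check_logprobs_close are
# defined in the test module shown above.
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
@pytest.mark.parametrize("max_tokens", [64])      # illustrative value
@pytest.mark.parametrize("num_logprobs", [5])     # illustrative value
def test_models(hf_runner, vllm_runner, example_prompts, model, max_tokens,
                num_logprobs) -> None:
    # Reference logprobs from the Hugging Face implementation.
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    # One unconditional vLLM run: the V1_SUPPORTED_MODELS gate and the
    # vllm_v1_outputs = None fallback are gone along with V0.
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # Compare top-k logprobs within a tolerance rather than exact tokens.
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )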

View File: tests/models/registry.py

@@ -312,14 +312,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
     "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
     "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
-    "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning",  # noqa: E501
-                                            trust_remote_code=True,
-                                            v0_only=True,
-                                            max_model_len=10240),
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
-                                         trust_remote_code=True),
+                                         max_transformers_version="4.55.4",
+                                         transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+                                         trust_remote_code=True),
     "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
                                        max_transformers_version="4.53",
                                        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
@@ -330,7 +328,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
     "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
-                                            min_transformers_version="4.56.2"),
+                                            extras={"tiny-random": "tiny-random/qwen3-next-moe"},  # noqa: E501
+                                            min_transformers_version="4.56.3"),
     "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct",  # noqa: E501
                                           trust_remote_code=True,
@@ -644,7 +643,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                     trust_remote_code=True,
                                     speculative_model="XiaomiMiMo/MiMo-7B-RL"),
     "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
-                                    min_transformers_version="4.56.2"),
+                                    min_transformers_version="4.56.3"),
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
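
The registry fields touched above control when an example model is exercised: min_transformers_version and max_transformers_version bound the installed Transformers release, transformers_version_reason documents why, and extras maps nicknames to lighter checkpoints (here the tiny-random Qwen3-Next used by the hybrid tests). A minimal sketch of that gating; the field names follow the diff, but the ExampleInfo class itself is an illustrative stand-in, not vLLM's actual _HfExamplesInfo:

from dataclasses import dataclass, field
from typing import Optional

import transformers
from packaging.version import Version


@dataclass
class ExampleInfo:
    """Illustrative stand-in for _HfExamplesInfo in tests/models/registry.py."""
    default: str  # HF Hub model ID, e.g. "pfnet/plamo-2-1b"
    extras: dict = field(default_factory=dict)  # nickname -> alternate checkpoint
    trust_remote_code: bool = False
    min_transformers_version: Optional[str] = None
    max_transformers_version: Optional[str] = None
    transformers_version_reason: Optional[str] = None

    def check_transformers_version(self) -> None:
        # Fail fast when the installed transformers falls outside the
        # [min, max] window declared for this example model.
        current = Version(transformers.__version__)
        if (self.min_transformers_version is not None
                and current < Version(self.min_transformers_version)):
            raise RuntimeError(f"{self.default} requires transformers>="
                               f"{self.min_transformers_version}")
        if (self.max_transformers_version is not None
                and current > Version(self.max_transformers_version)):
            reason = self.transformers_version_reason or "unspecified"
            raise RuntimeError(f"{self.default} capped at transformers<="
                               f"{self.max_transformers_version}: {reason}")


# Usage mirroring the Plamo2ForCausalLM entry edited above.
plamo2 = ExampleInfo(
    "pfnet/plamo-2-1b",
    max_transformers_version="4.55.4",
    transformers_version_reason=("HF model uses remote code that is not "
                                 "compatible with latest Transformers"),
    trust_remote_code=True)
plamo2.check_transformers_version()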