[FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)

Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-13 18:47:16 -04:00
parent 7200a21cd1
commit e3fdb627d9
8 changed files with 153 additions and 86 deletions
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -258,13 +258,13 @@ def tractable_computation(

@torch.inference_mode
 def run_model(
-    llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False
+    llama_config, use_compile: bool, backend: str, split_attn: bool = False
 ) -> torch.Tensor:
    if use_compile:
        compilation_config = CompilationConfig(
            level=CompilationLevel.PIECEWISE,
            use_cudagraph=True,
-            use_inductor=use_inductor,
+            backend=backend,
            cudagraph_capture_sizes=[1, 2],
        )
        if split_attn:
@@ -338,8 +338,8 @@ def run_model(
            return output.cpu()


-@pytest.mark.parametrize("use_inductor", [True, False])
-def test_toy_llama(use_inductor: bool):
+@pytest.mark.parametrize("backend", ["inductor", "eager"])
+def test_toy_llama(backend: str):
    # compare output with and without piecewise compilation

    llama_config = LlamaConfig(
@@ -358,10 +358,10 @@ def test_toy_llama(use_inductor: bool):
        num_backend_compilations=0,
        num_cudagraph_captured=0,
    ):
-        outputs.append(run_model(llama_config, use_inductor=False, use_compile=False))
-    run_model(tractable_config, use_inductor=False, use_compile=False)
+        outputs.append(run_model(llama_config, backend="eager", use_compile=False))
+    run_model(tractable_config, backend="eager", use_compile=False)

-    if use_inductor:
+    if backend == "inductor":
        kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
    else:
        kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
@@ -377,10 +377,8 @@ def test_toy_llama(use_inductor: bool):
        num_cudagraph_captured=2,
        **kwargs,
    ):
-        outputs.append(
-            run_model(llama_config, use_inductor=use_inductor, use_compile=True)
-        )
-    run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
+        outputs.append(run_model(llama_config, backend=backend, use_compile=True))
+    run_model(tractable_config, backend=backend, use_compile=True)

    with compilation_counter.expect(
        num_graphs_seen=1,  # one graph for the model
@@ -395,16 +393,9 @@ def test_toy_llama(use_inductor: bool):
        ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        outputs.append(
-            run_model(
-                llama_config,
-                use_inductor=use_inductor,
-                use_compile=True,
-                split_attn=True,
-            )
+            run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
        )
-    run_model(
-        tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True
-    )
+    run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)

    for i in range(1, len(outputs)):
        assert torch.allclose(outputs[0], outputs[i])
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -77,14 +77,15 @@ class TestSetting:
            method="encode",
        ),
        # vision language model
-        TestSetting(
-            model="microsoft/Phi-3.5-vision-instruct",
-            model_args=["--trust-remote-code", "--max-model-len", "2048"],
-            pp_size=2,
-            tp_size=1,
-            attn_backend="FLASH_ATTN",
-            method="generate_with_image",
-        ),
+        # See https://github.com/vllm-project/vllm/issues/26716.
+        # TestSetting(
+        #     model="microsoft/Phi-3.5-vision-instruct",
+        #     model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        #     pp_size=2,
+        #     tp_size=1,
+        #     attn_backend="FLASH_ATTN",
+        #     method="generate_with_image",
+        # ),
    ],
 )
 def test_compile_correctness(
@@ -109,41 +110,46 @@ def test_compile_correctness(
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
        final_args = [
-            "--enforce-eager",
            *model_args,
            "-pp",
            str(pp_size),
            "-tp",
            str(tp_size),
+            "-O.cudagraph_mode=none",
        ]

        all_args: list[list[str]] = []
        all_envs: list[dict[str, str] | None] = []

-        for level in [
-            CompilationLevel.NO_COMPILATION,
+        for comp_level in [
+            CompilationLevel.DYNAMO_AS_IS,
+            CompilationLevel.DYNAMO_ONCE,
            CompilationLevel.PIECEWISE,
        ]:
-            all_args.append(final_args + [f"-O{level}"])
-            all_envs.append({})
+            for level in [CompilationLevel.NO_COMPILATION, comp_level]:
+                all_args.append(
+                    final_args + [f"-O.level={level}", "-O.backend=inductor"]
+                )

-        # inductor will change the output, so we only compare if the output
-        # is close, not exactly the same.
-        compare_all_settings(
-            model,
-            all_args,
-            all_envs,
-            method=method if method != "generate" else "generate_close",
-        )
-        all_envs.clear()
-        all_args.clear()
+            # inductor will change the output, so we only compare if the output
+            # is close, not exactly the same.
+            compare_all_settings(
+                model,
+                all_args,
+                all_envs,
+                method=method if method != "generate" else "generate_close",
+            )
+            all_envs.clear()
+            all_args.clear()

        for level in [
            CompilationLevel.NO_COMPILATION,
            CompilationLevel.DYNAMO_AS_IS,
            CompilationLevel.DYNAMO_ONCE,
+            CompilationLevel.PIECEWISE,
        ]:
-            all_args.append(final_args + [f"-O{level}"])
+            all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
+            all_envs.append({})
            all_envs.append({})

        compare_all_settings(model, all_args * 3, all_envs, method=method)
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -36,55 +36,56 @@ class Relu3(ReLUSquaredActivation):


@pytest.mark.parametrize(
-    "env, torch_level, use_inductor, ops_enabled, default_on",
+    "env, torch_level, backend, ops_enabled, default_on",
    [
        # Default values based on compile level
        # - All by default (no Inductor compilation)
-        (None, 0, False, [True] * 4, True),
-        (None, 1, True, [True] * 4, True),
-        (None, 2, False, [True] * 4, True),
+        (None, 0, "eager", [True] * 4, True),
+        (None, 1, "eager", [True] * 4, True),
+        (None, 2, "eager", [True] * 4, True),
+        (None, 3, "eager", [True] * 4, True),
        # - All by default (Inductor backend selected, but level 0)
-        (None, 3, True, [False] * 4, False),
-        (None, 4, True, [False] * 4, False),
-        # - All by default (without Inductor)
-        (None, 3, False, [True] * 4, True),
-        (None, 4, False, [True] * 4, True),
+        (None, 0, "inductor", [True] * 4, True),
+        # - None by default (with Inductor)
+        (None, 1, "inductor", [False] * 4, False),
+        (None, 2, "inductor", [False] * 4, False),
+        (None, 3, "inductor", [False] * 4, False),
        # Explicitly enabling/disabling
        #
        # Default: all
        #
        # All but SiluAndMul
-        ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
+        ("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True),
        # Only ReLU3
-        ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
+        ("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False),
        # All but SiluAndMul
-        ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
+        ("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True),
        # All but ReLU3 (even if ReLU2 is on)
-        ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True),
+        ("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True),
        # RMSNorm and SiluAndMul
-        ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
+        ("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False),
        # All but RMSNorm
-        ("-rms_norm", 3, False, [0, 1, 1, 1], True),
+        ("-rms_norm", 3, "eager", [0, 1, 1, 1], True),
        #
        # Default: none
        #
        # Only ReLU3
-        ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
+        ("none,+relu3", 3, "inductor", [0, 0, 0, 1], False),
        # All but RMSNorm
-        ("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
+        ("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True),
    ],
 )
 def test_enabled_ops(
    env: str | None,
    torch_level: int,
-    use_inductor: bool,
+    backend: str,
    ops_enabled: list[int],
    default_on: bool,
 ):
    custom_ops = env.split(",") if env else []
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            use_inductor=bool(use_inductor), level=torch_level, custom_ops=custom_ops
+            backend=backend, level=torch_level, custom_ops=custom_ops
        )
    )
    with set_current_vllm_config(vllm_config):