[Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)

Signed-off-by: Richard Zou <zou3519@gmail.com>
This commit is contained in:
Richard Zou
2025-06-07 20:56:12 -04:00
committed by GitHub
parent d77f7fb871
commit eaa2e51088
6 changed files with 52 additions and 8 deletions

View File

@@ -95,7 +95,7 @@ def _test_simple_piecewise_compile(*, use_inductor):
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=
num_cudagraph_captured=
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):

View File

@@ -327,7 +327,7 @@ def _test_toy_llama(*, use_inductor):
num_piecewise_graphs_seen=0,
num_piecewise_capturable_graphs_seen=0,
num_backend_compilations=0,
num_cudagraph_caputured=0,
num_cudagraph_captured=0,
):
outputs.append(
run_model(llama_config, use_inductor=False, use_compile=False))
@@ -343,7 +343,7 @@ def _test_toy_llama(*, use_inductor):
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=
num_cudagraph_captured=
2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
**kwargs,
):
@@ -361,7 +361,7 @@ def _test_toy_llama(*, use_inductor):
llama_config.num_layers, # 1 + num_layers
num_backend_compilations=1 +
llama_config.num_layers, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=2 *
num_cudagraph_captured=2 *
(1 + llama_config.num_layers
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
set_current_vllm_config)
from .piecewise.test_simple import SillyModel
@pytest.fixture(scope="function", autouse=True)
def use_v1(monkeypatch):
    """Force the v1 engine for every test in this module.

    TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now,
    I'll switch them over later.
    """
    # monkeypatch restores the original environment after each test.
    monkeypatch.setenv('VLLM_USE_V1', '1')
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(enabled):
    """Check that `use_cudagraph` controls CUDA graph capture in v1.

    With the flag on, the second forward pass should record exactly one
    CUDA graph; with it off, none should be captured.
    """
    # The autouse fixture above must have switched us to the v1 engine.
    assert vllm.envs.VLLM_USE_V1

    compilation_config = CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=enabled,
        cudagraph_capture_sizes=[100],
    )
    vllm_config = VllmConfig(compilation_config=compilation_config)
    with set_current_vllm_config(vllm_config):
        model = SillyModel(vllm_config=vllm_config, prefix='')

    inputs = torch.randn(100, device="cuda")

    expected_captures = 1 if enabled else 0
    with compilation_counter.expect(
            num_graphs_seen=1,  # one graph for the model
            num_cudagraph_captured=expected_captures,
    ):
        # First call is the warmup pass; the second one performs the
        # CUDA graph recording (when cudagraphs are enabled).
        model(inputs)
        model(inputs)