[v1][torch.compile] support managing cudagraph buffer (#10203)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-11 11:10:27 -08:00
parent d7a4f2207b
commit 330e82d34a
4 changed files with 59 additions and 8 deletions
--- a/tests/compile/piecewise/piecewise_compilation_config.json
+++ b/tests/compile/piecewise/piecewise_compilation_config.json
@@ -1,4 +1,5 @@
 {
    "use_cudagraph": true,
-    "non_cudagraph_ops": ["silly.attention"]
+    "non_cudagraph_ops": ["silly.attention"],
+    "cudagraph_copy_inputs": true
 }
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -80,7 +80,7 @@ def test_simple_piecewise_compile():
    config = os.path.join(directory, "piecewise_compilation_config.json")
    os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config

-    input_buffer = torch.randn(100).cuda()
+    inputs = torch.randn(100).cuda()

    with compilation_counter.expect(
            num_graphs_seen=1,  # one graph for the model
@@ -92,15 +92,15 @@ def test_simple_piecewise_compile():
    ):

        with set_compile_context([1, 2]):
-            model(input_buffer)
+            model(inputs)

-            model(input_buffer[:2])
-            model(input_buffer[:1])
+            model(torch.randn(2).cuda())
+            model(torch.randn(1).cuda())

-        input_buffer[:2].zero_()
+        input = torch.zeros(2).cuda()
        global global_counter
        global_counter = 0
-        output = model(input_buffer[:2])
+        output = model(input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))