Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/dbo-cudagraph-size

init
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-09-26 20:45:41 +00:00 · 2025-09-26 18:27:30 +00:00 · 2025-09-26 17:38:39 +00:00
2 changed files with 32 additions and 28 deletions
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3477,8 +3477,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        # We skip EPLB here since we don't want to record dummy metrics
        for num_tokens in compilation_cases:
            # We currently only capture ubatched graphs when it's a FULL
-            # cudagraph and for uniform decode batches.
+            # cudagraph, a uniform decode batch, and the number of tokens
-            capture_ubatched_graph = self.parallel_config.enable_dbo \
+            # is above the threshold. Otherwise we just capture a non-ubatched
            # version of the graph
            allow_microbatching = self.parallel_config.enable_dbo \
                and cudagraph_runtime_mode == CUDAGraphMode.FULL \
                and uniform_decode \
                and check_ubatch_thresholds(
@@ -3487,37 +3489,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                    uniform_decode=uniform_decode,
                )
-            # Currently we capture both microbatched and non-microbatched
+            for _ in range(self.compilation_config.cudagraph_num_of_warmups):
-            # graphs when capture_ubatched_graph is True, this is because
+                # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
-            # occasionally we will be forced out of microbatching due to other
+                # But be careful, warm up with `NONE` is orthogonal to
-            # DP ranks not microbatching (usually caused by an empty second
+                # if we want to warm up attention or not. This is
-            # microbatch; once we resolve this, we can remove the
+                # different from the case where `FULL` implies capture
-            # non-microbatched graph capture).
+                # attention while `PIECEWISE` implies no attention.
-            allow_microbatching_options = [True, False] if \
+                force_attention = (
-                capture_ubatched_graph else [False]
+                    cudagraph_runtime_mode == CUDAGraphMode.FULL)
            for allow_microbatching in allow_microbatching_options:
                for _ in range(
                        self.compilation_config.cudagraph_num_of_warmups):
                    # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
                    # But be careful, warm up with `NONE` is orthogonal to
                    # if we want to warm up attention or not. This is
                    # different from the case where `FULL` implies capture
                    # attention while `PIECEWISE` implies no attention.
                    force_attention = (
                        cudagraph_runtime_mode == CUDAGraphMode.FULL)
                    self._dummy_run(num_tokens,
                                    cudagraph_runtime_mode=CUDAGraphMode.NONE,
                                    force_attention=force_attention,
                                    uniform_decode=uniform_decode,
                                    allow_microbatching=allow_microbatching,
                                    skip_eplb=True,
                                    remove_lora=False)
                self._dummy_run(num_tokens,
-                                cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                cudagraph_runtime_mode=CUDAGraphMode.NONE,
                                force_attention=force_attention,
                                uniform_decode=uniform_decode,
                                allow_microbatching=allow_microbatching,
                                skip_eplb=True,
                                remove_lora=False)
            self._dummy_run(num_tokens,
                            cudagraph_runtime_mode=cudagraph_runtime_mode,
                            uniform_decode=uniform_decode,
                            allow_microbatching=allow_microbatching,
                            skip_eplb=True,
                            remove_lora=False)
        self.maybe_remove_all_loras(self.lora_config)
    def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -330,6 +330,18 @@ class UBatchWrapper:
        # If there's no ubatching, just run the runnable object
        if ubatch_slices is None:
            # This is to account for the case where ubatching was aborted.
            # When we capture full graphs we only capture one graph per shape,
            # meaning that if we have a ubatched cudagraph for the current
            # num_tokens, we don't have a non-ubatched one. Without this
            # check, the cudagraph wrapper will try to capture a cudagraph
            # for this shape during a normal run.
            if cudagraph_runtime_mode is CUDAGraphMode.FULL:
                assert batch_descriptor is not None
                if batch_descriptor.num_tokens in self.cudagraphs:
                    cudagraph_runtime_mode = CUDAGraphMode.NONE
            if cudagraph_runtime_mode in (CUDAGraphMode.NONE,
                                          CUDAGraphMode.PIECEWISE):
                return self.runnable(*args, **kwargs)
Author	SHA1	Message	Date
Sage Moore	a4516dc190	Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/dbo-cudagraph-size	2025-09-26 20:45:41 +00:00
Sage Moore	2f29120a09	init Signed-off-by: Sage Moore <sage@neuralmagic.com>	2025-09-26 18:27:30 +00:00
Sage Moore	1fc9de4c1a	init Signed-off-by: Sage Moore <sage@neuralmagic.com>	2025-09-26 17:38:39 +00:00