Compare commits
3 Commits
use-uv-pyt
...
dbo-cudagr
| Author | SHA1 | Date | |
|---|---|---|---|
| a4516dc190 | |||
| 2f29120a09 | |||
| 1fc9de4c1a |
@ -3477,8 +3477,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
# We skip EPLB here since we don't want to record dummy metrics
|
# We skip EPLB here since we don't want to record dummy metrics
|
||||||
for num_tokens in compilation_cases:
|
for num_tokens in compilation_cases:
|
||||||
# We currently only capture ubatched graphs when it's a FULL
|
# We currently only capture ubatched graphs when it's a FULL
|
||||||
# cudagraph and for uniform decode batches.
|
# cudagraph, a uniform decode batch, and the number of tokens
|
||||||
capture_ubatched_graph = self.parallel_config.enable_dbo \
|
# is above the threshold. Otherwise we just capture a non-ubatched
|
||||||
|
# version of the graph
|
||||||
|
allow_microbatching = self.parallel_config.enable_dbo \
|
||||||
and cudagraph_runtime_mode == CUDAGraphMode.FULL \
|
and cudagraph_runtime_mode == CUDAGraphMode.FULL \
|
||||||
and uniform_decode \
|
and uniform_decode \
|
||||||
and check_ubatch_thresholds(
|
and check_ubatch_thresholds(
|
||||||
@ -3487,37 +3489,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
uniform_decode=uniform_decode,
|
uniform_decode=uniform_decode,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Currently we capture both microbatched and non-microbatched
|
for _ in range(self.compilation_config.cudagraph_num_of_warmups):
|
||||||
# graphs when capture_ubatched_graph is True, this is because
|
# Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
|
||||||
# occasionally we will be forced out of microbatching due to other
|
# But be careful, warm up with `NONE` is orthogonal to
|
||||||
# DP ranks not microbatching (usually caused by an empty second
|
# if we want to warm up attention or not. This is
|
||||||
# microbatch; once we resolve this, we can remove the
|
# different from the case where `FULL` implies capture
|
||||||
# non-microbatched graph capture).
|
# attention while `PIECEWISE` implies no attention.
|
||||||
allow_microbatching_options = [True, False] if \
|
force_attention = (
|
||||||
capture_ubatched_graph else [False]
|
cudagraph_runtime_mode == CUDAGraphMode.FULL)
|
||||||
for allow_microbatching in allow_microbatching_options:
|
|
||||||
for _ in range(
|
|
||||||
self.compilation_config.cudagraph_num_of_warmups):
|
|
||||||
# Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
|
|
||||||
# But be careful, warm up with `NONE` is orthogonal to
|
|
||||||
# if we want to warm up attention or not. This is
|
|
||||||
# different from the case where `FULL` implies capture
|
|
||||||
# attention while `PIECEWISE` implies no attention.
|
|
||||||
force_attention = (
|
|
||||||
cudagraph_runtime_mode == CUDAGraphMode.FULL)
|
|
||||||
self._dummy_run(num_tokens,
|
|
||||||
cudagraph_runtime_mode=CUDAGraphMode.NONE,
|
|
||||||
force_attention=force_attention,
|
|
||||||
uniform_decode=uniform_decode,
|
|
||||||
allow_microbatching=allow_microbatching,
|
|
||||||
skip_eplb=True,
|
|
||||||
remove_lora=False)
|
|
||||||
self._dummy_run(num_tokens,
|
self._dummy_run(num_tokens,
|
||||||
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
cudagraph_runtime_mode=CUDAGraphMode.NONE,
|
||||||
|
force_attention=force_attention,
|
||||||
uniform_decode=uniform_decode,
|
uniform_decode=uniform_decode,
|
||||||
allow_microbatching=allow_microbatching,
|
allow_microbatching=allow_microbatching,
|
||||||
skip_eplb=True,
|
skip_eplb=True,
|
||||||
remove_lora=False)
|
remove_lora=False)
|
||||||
|
self._dummy_run(num_tokens,
|
||||||
|
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
||||||
|
uniform_decode=uniform_decode,
|
||||||
|
allow_microbatching=allow_microbatching,
|
||||||
|
skip_eplb=True,
|
||||||
|
remove_lora=False)
|
||||||
self.maybe_remove_all_loras(self.lora_config)
|
self.maybe_remove_all_loras(self.lora_config)
|
||||||
|
|
||||||
def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
|
def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
|
||||||
|
|||||||
@ -330,6 +330,18 @@ class UBatchWrapper:
|
|||||||
|
|
||||||
# If there's no ubatching, just run the runnable object
|
# If there's no ubatching, just run the runnable object
|
||||||
if ubatch_slices is None:
|
if ubatch_slices is None:
|
||||||
|
|
||||||
|
# This is to account for the case where ubatching was aborted.
|
||||||
|
# When we capture full graphs we only capture one graph per shape,
|
||||||
|
# meaning that if we have a ubatched cudagraph for the current
|
||||||
|
# num_tokens, we don't have a non-ubatched one. Without this
|
||||||
|
# check, the cudagraph wrapper will try to capture a cudagraph
|
||||||
|
# for this shape during a normal run.
|
||||||
|
if cudagraph_runtime_mode is CUDAGraphMode.FULL:
|
||||||
|
assert batch_descriptor is not None
|
||||||
|
if batch_descriptor.num_tokens in self.cudagraphs:
|
||||||
|
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||||
|
|
||||||
if cudagraph_runtime_mode in (CUDAGraphMode.NONE,
|
if cudagraph_runtime_mode in (CUDAGraphMode.NONE,
|
||||||
CUDAGraphMode.PIECEWISE):
|
CUDAGraphMode.PIECEWISE):
|
||||||
return self.runnable(*args, **kwargs)
|
return self.runnable(*args, **kwargs)
|
||||||
|
|||||||
Reference in New Issue
Block a user