Compare commits

...

2 Commits

Author SHA1 Message Date
6f62c94d7e updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-10-03 13:47:16 -04:00
52a7d91980 debug
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-10-03 13:25:00 -04:00
4 changed files with 7 additions and 1 deletion

View File

@@ -1050,6 +1050,7 @@ class DPEngineCoreProc(EngineCoreProc):
self._maybe_publish_request_counts() self._maybe_publish_request_counts()
local_unfinished_reqs = self.scheduler.has_unfinished_requests() local_unfinished_reqs = self.scheduler.has_unfinished_requests()
logger.info(f"{local_unfinished_reqs=}")
if not executed: if not executed:
if not local_unfinished_reqs and not self.engines_running: if not local_unfinished_reqs and not self.engines_running:
# All engines are idle. # All engines are idle.
@@ -1057,6 +1058,7 @@ class DPEngineCoreProc(EngineCoreProc):
# We are in a running state and so must execute a dummy pass # We are in a running state and so must execute a dummy pass
# if the model didn't execute any ready requests. # if the model didn't execute any ready requests.
logger.info("===EXECUTE_DUMMY_BATCH===")
self.execute_dummy_batch() self.execute_dummy_batch()
# 3) All-reduce operation to determine global unfinished reqs. # 3) All-reduce operation to determine global unfinished reqs.

View File

@@ -1077,7 +1077,7 @@ class DPAsyncMPClient(AsyncMPClient):
if counts is not None: if counts is not None:
sliced_counts = counts[count_slice] sliced_counts = counts[count_slice]
self.lb_engines = sliced_counts self.lb_engines = sliced_counts
logger.debug("Received counts: %s (%s)", sliced_counts, logger.debug("Received counts: %s (%s)", counts,
count_slice) count_slice)
resources.stats_update_task = asyncio.create_task( resources.stats_update_task = asyncio.create_task(

View File

@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Run the model. # Run the model.
# Use persistent buffers for CUDA graphs. # Use persistent buffers for CUDA graphs.
logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
with (set_forward_context( with (set_forward_context(
attn_metadata, attn_metadata,
self.vllm_config, self.vllm_config,
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
(1 token) and prefill (multiple tokens) requests. (1 token) and prefill (multiple tokens) requests.
remove_lora: If False, dummy LoRAs are not destroyed after the run remove_lora: If False, dummy LoRAs are not destroyed after the run
""" """
logger.info("====== DUMMY RUN")
assert cudagraph_runtime_mode is None or \ assert cudagraph_runtime_mode is None or \
cudagraph_runtime_mode.valid_runtime_modes() cudagraph_runtime_mode.valid_runtime_modes()

View File

@@ -167,6 +167,7 @@ def ubatch_split(
num_tokens_unpadded, num_tokens_unpadded,
uniform_decode=uniform_decode, uniform_decode=uniform_decode,
) )
logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
# Don't microbatch unless every other DP worker is also microbatching # Don't microbatch unless every other DP worker is also microbatching
should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch( should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
should_attempt_ubatching, should_attempt_ubatching,
vllm_config, vllm_config,
) )
logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
if not should_ubatch: if not should_ubatch:
return (None, None) return (None, None)