Compare commits

2 Commits

main...debug-logs

| Author | SHA1 | Date |
|---|---|---|
| | 6f62c94d7e | |
| | 52a7d91980 | |
```diff
@@ -1050,6 +1050,7 @@ class DPEngineCoreProc(EngineCoreProc):
         self._maybe_publish_request_counts()
 
         local_unfinished_reqs = self.scheduler.has_unfinished_requests()
+        logger.info(f"{local_unfinished_reqs=}")
         if not executed:
             if not local_unfinished_reqs and not self.engines_running:
                 # All engines are idle.
```
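The added log line uses the f-string debug specifier (`f"{name=}"`, Python 3.8+), which prints both the expression text and its value, so no format string is needed. A minimal standalone illustration:

```python
# f"{name=}" expands to "name=<repr(value)>", so the log above renders
# as e.g. "local_unfinished_reqs=True".
local_unfinished_reqs = True
print(f"{local_unfinished_reqs=}")  # -> local_unfinished_reqs=True
```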
```diff
@@ -1057,6 +1058,7 @@ class DPEngineCoreProc(EngineCoreProc):
 
             # We are in a running state and so must execute a dummy pass
             # if the model didn't execute any ready requests.
+            logger.info("===EXECUTE_DUMMY_BATCH===")
             self.execute_dummy_batch()
 
         # 3) All-reduce operation to determine global unfinished reqs.
```
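For context on what the `===EXECUTE_DUMMY_BATCH===` sentinel marks: in data-parallel serving the ranks step the model in lockstep, so a rank with no ready requests still executes a no-op pass. A toy sketch of that control flow (assumed semantics for illustration, not vLLM's actual implementation):

```python
# Toy model of DP lockstep: every rank joins each step; an idle rank
# runs a dummy batch, which the new sentinel log marks.
def step(rank: int, has_work: bool) -> str:
    if has_work:
        return f"rank {rank}: real batch"
    return f"rank {rank}: dummy batch"  # the branch the new log flags

for rank, busy in enumerate([True, False, True]):
    print(step(rank, busy))
```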
```diff
@@ -1077,7 +1077,7 @@ class DPAsyncMPClient(AsyncMPClient):
                 if counts is not None:
                     sliced_counts = counts[count_slice]
                     self.lb_engines = sliced_counts
-                    logger.debug("Received counts: %s (%s)", sliced_counts,
+                    logger.debug("Received counts: %s (%s)", counts,
                                  count_slice)
 
         resources.stats_update_task = asyncio.create_task(
```
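The one-line change swaps the logged value from the slice this client owns to the full per-engine list, giving a global rather than local view of the request counts. With hypothetical values (the names mirror the diff; the numbers are invented):

```python
# Hypothetical illustration of what each version of the log shows.
counts = [3, 7, 0, 5]        # per-engine request counts (invented)
count_slice = slice(1, 3)    # engines owned by this client (invented)
sliced_counts = counts[count_slice]
print(sliced_counts)         # [7, 0]     -- old log: local view only
print(counts, count_slice)   # full list  -- new log: global view
```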
```diff
@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
```
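One caveat worth flagging for a debug branch: the new `logger.info(f"...")` lines format their arguments eagerly, while `%s`-style calls like the `logger.debug` above defer formatting until the record is actually emitted. A small demonstration:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

x = list(range(3))
logger.debug("lazy: %s", x)  # filtered out; x is never formatted
logger.info(f"eager: {x=}")  # f-string is built before the level check
```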
```diff
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
 
```
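Together, the `====== EXECUTE ...` and `====== DUMMY RUN` sentinels make it easy to separate real forward passes from dummy/warmup passes when post-processing a captured log. A hypothetical sketch (the log lines are invented for illustration):

```python
# Split captured log lines on the two sentinels added in this branch.
lines = [
    "====== EXECUTE ubatch_slices=None, num_input_tokens=8, num_tokens_across_dp=None",
    "====== DUMMY RUN",
    "====== DUMMY RUN",
]
real = [ln for ln in lines if "====== EXECUTE" in ln]
dummy = [ln for ln in lines if "====== DUMMY RUN" in ln]
print(len(real), len(dummy))  # -> 1 2
```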
```diff
@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
 
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
```
```diff
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
 
     if not should_ubatch:
         return (None, None)
```
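These two `====` logs bracket the microbatching decision: the first records this rank's local verdict (`should_attempt_ubatching`), the second the DP-wide outcome (`should_ubatch`) after coordination, so a single rank vetoing microbatching for the group shows up clearly. A toy sketch of that all-or-nothing agreement (assumed semantics; the real `get_dp_padding_ubatch` also computes padding):

```python
# Toy stand-in for the coordination step: per the comment in the diff,
# microbatching happens only if every DP rank attempts it.
def agree_to_ubatch(local_attempts: list[bool]) -> bool:
    return all(local_attempts)

print(agree_to_ubatch([True, True, True]))   # True  -> proceed
print(agree_to_ubatch([True, False, True]))  # False -> return (None, None)
```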