Compare commits

2 Commits

main...debug-logs

| Author | SHA1 | Date |
|---|---|---|
| | 6f62c94d7e | |
| | 52a7d91980 | |
```diff
@@ -1050,6 +1050,7 @@ class DPEngineCoreProc(EngineCoreProc):
         self._maybe_publish_request_counts()
 
         local_unfinished_reqs = self.scheduler.has_unfinished_requests()
+        logger.info(f"{local_unfinished_reqs=}")
         if not executed:
             if not local_unfinished_reqs and not self.engines_running:
                 # All engines are idle.
```
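The added log line uses the f-string debug specifier (`f"{name=}"`, Python 3.8+), which prints both the expression text and its value, so no format string is needed. A minimal standalone illustration:

```python
# f"{name=}" expands to "name=<repr(value)>", so the log above renders
# as e.g. "local_unfinished_reqs=True".
local_unfinished_reqs = True
print(f"{local_unfinished_reqs=}")  # -> local_unfinished_reqs=True
```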
```diff
@@ -1057,6 +1058,7 @@ class DPEngineCoreProc(EngineCoreProc):
 
             # We are in a running state and so must execute a dummy pass
             # if the model didn't execute any ready requests.
+            logger.info("===EXECUTE_DUMMY_BATCH===")
             self.execute_dummy_batch()
 
         # 3) All-reduce operation to determine global unfinished reqs.
```
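For context on what the `===EXECUTE_DUMMY_BATCH===` sentinel marks: in data-parallel serving the ranks step the model in lockstep, so a rank with no ready requests still executes a no-op pass. A toy sketch of that control flow (assumed semantics for illustration, not vLLM's actual implementation):

```python
# Toy model of DP lockstep: every rank joins each step; an idle rank
# runs a dummy batch, which the new sentinel log marks.
def step(rank: int, has_work: bool) -> str:
    if has_work:
        return f"rank {rank}: real batch"
    return f"rank {rank}: dummy batch"  # the branch the new log flags

for rank, busy in enumerate([True, False, True]):
    print(step(rank, busy))
```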
```diff
@@ -1077,7 +1077,7 @@ class DPAsyncMPClient(AsyncMPClient):
                 if counts is not None:
                     sliced_counts = counts[count_slice]
                     self.lb_engines = sliced_counts
-                    logger.debug("Received counts: %s (%s)", sliced_counts,
+                    logger.debug("Received counts: %s (%s)", counts,
                                  count_slice)
 
         resources.stats_update_task = asyncio.create_task(
```
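The one-line change swaps the logged value from the slice this client owns to the full per-engine list, giving a global rather than local view of the request counts. With hypothetical values (the names mirror the diff; the numbers are invented):

```python
# Hypothetical illustration of what each version of the log shows.
counts = [3, 7, 0, 5]        # per-engine request counts (invented)
count_slice = slice(1, 3)    # engines owned by this client (invented)
sliced_counts = counts[count_slice]
print(sliced_counts)         # [7, 0]     -- old log: local view only
print(counts, count_slice)   # full list  -- new log: global view
```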
```diff
@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
```
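One caveat worth flagging for a debug branch: the new `logger.info(f"...")` lines format their arguments eagerly, while `%s`-style calls like the `logger.debug` above defer formatting until the record is actually emitted. A small demonstration:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

x = list(range(3))
logger.debug("lazy: %s", x)  # filtered out; x is never formatted
logger.info(f"eager: {x=}")  # f-string is built before the level check
```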
```diff
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
 
```
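Together, the `====== EXECUTE ...` and `====== DUMMY RUN` sentinels make it easy to separate real forward passes from dummy/warmup passes when post-processing a captured log. A hypothetical sketch (the log lines are invented for illustration):

```python
# Split captured log lines on the two sentinels added in this branch.
lines = [
    "====== EXECUTE ubatch_slices=None, num_input_tokens=8, num_tokens_across_dp=None",
    "====== DUMMY RUN",
    "====== DUMMY RUN",
]
real = [ln for ln in lines if "====== EXECUTE" in ln]
dummy = [ln for ln in lines if "====== DUMMY RUN" in ln]
print(len(real), len(dummy))  # -> 1 2
```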
```diff
@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
 
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
```
```diff
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
 
     if not should_ubatch:
         return (None, None)
```
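These two `====` logs bracket the microbatching decision: the first records this rank's local verdict (`should_attempt_ubatching`), the second the DP-wide outcome (`should_ubatch`) after coordination, so a single rank vetoing microbatching for the group shows up clearly. A toy sketch of that all-or-nothing agreement (assumed semantics; the real `get_dp_padding_ubatch` also computes padding):

```python
# Toy stand-in for the coordination step: per the comment in the diff,
# microbatching happens only if every DP rank attempts it.
def agree_to_ubatch(local_attempts: list[bool]) -> bool:
    return all(local_attempts)

print(agree_to_ubatch([True, True, True]))   # True  -> proceed
print(agree_to_ubatch([True, False, True]))  # False -> return (None, None)
```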