From 0eb8f2b88004157da4cf7b41a72182a7c91d08a2 Mon Sep 17 00:00:00 2001
From: Lunwen He <lwhecser@gmail.com>
Date: Mon, 20 Oct 2025 19:04:14 -0700
Subject: [PATCH] create is_in_the_same_node on cpu (#26832)

Co-authored-by: Lunwen He <lunwenh@meta.com>
---
 .buildkite/test-amd.yaml            |  1 +
 .buildkite/test-pipeline.yaml       |  1 +
 tests/distributed/test_same_node.py | 28 +++++++++++++++++++++-------
 vllm/distributed/parallel_state.py  |  4 +++-
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 50b2b61124..a65f26d716 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1081,6 +1081,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a28e333eac..3d8bbed56b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -977,6 +977,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 8b7bd9fc40..4444327f01 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -3,12 +3,25 @@
 
 import os
 
+import torch
 import torch.distributed as dist
 
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.utils.network_utils import get_ip, get_open_port
 
+
+def _run_test(pg):
+    """Assert that the same-node check on ``pg`` matches VLLM_TEST_SAME_HOST."""
+    want = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    got = all(in_the_same_node_as(pg, source_rank=0))
+    assert got == want, f"Expected {want}, got {got}"
+    if pg == dist.group.WORLD:
+        print("Same node test passed! when using torch distributed!")
+    else:
+        print("Same node test passed! when using StatelessProcessGroup!")
+
+
 if __name__ == "__main__":
     dist.init_process_group(backend="gloo")
 
@@ -25,11 +38,12 @@ if __name__ == "__main__":
     stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
 
     for pg in [dist.group.WORLD, stateless_pg]:
-        test_result = all(in_the_same_node_as(pg, source_rank=0))
-
-        expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-        assert test_result == expected, f"Expected {expected}, got {test_result}"
-        if pg == dist.group.WORLD:
-            print("Same node test passed! when using torch distributed!")
+        if os.environ.get("VLLM_TEST_WITH_DEFAULT_DEVICE_SET", "0") == "1":
+            default_devices = ["cpu"]
+            if torch.cuda.is_available():
+                default_devices.append("cuda")
+            for device in default_devices:
+                torch.set_default_device(device)
+                _run_test(pg)
         else:
-            print("Same node test passed! when using StatelessProcessGroup!")
+            _run_test(pg)
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 38223c77d3..297af52ad7 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1526,7 +1526,9 @@ def in_the_same_node_as(
         ranks = list(range(world_size))
 
     # local tensor in each process to store the result
-    is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)
+    is_in_the_same_node = torch.tensor(
+        [0] * world_size, dtype=torch.int32, device="cpu"
+    )
 
     magic_message = b"magic_message"
     shm = None