[CI/Build] Reorganize root-level V1 tests (#25767)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-09-27 13:51:15 +08:00
committed by GitHub
parent b3613e3ace
commit cd87bfbf37
12 changed files with 75 additions and 81 deletions

View File

View File

@ -0,0 +1,157 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
from contextlib import ExitStack
from dataclasses import dataclass
from typing import Optional
import pytest
from vllm import SamplingParams
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient
from vllm.v1.metrics.loggers import StatLoggerBase
from vllm.v1.metrics.stats import IterationStats, SchedulerStats
DP_SIZE = int(os.getenv("DP_SIZE", 2))
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
enforce_eager=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
data_parallel_size=DP_SIZE,
)
async def generate(
engine: AsyncLLM,
request_id: str,
prompt: PromptType,
output_kind: RequestOutputKind,
max_tokens: int,
prompt_logprobs: Optional[int] = None,
data_parallel_rank: Optional[int] = None) -> tuple[int, str]:
# Ensure generate doesn't complete too fast for cancellation test.
await asyncio.sleep(0.2)
count = 0
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True,
output_kind=output_kind,
temperature=0,
prompt_logprobs=prompt_logprobs)
async for out in engine.generate(request_id=request_id,
prompt=prompt,
sampling_params=sampling_params,
data_parallel_rank=data_parallel_rank):
num_tokens = len(out.outputs[0].token_ids)
if output_kind == RequestOutputKind.DELTA:
count += num_tokens
else:
count = num_tokens
await asyncio.sleep(0.)
return count, request_id
@pytest.mark.parametrize(
"output_kind",
[
RequestOutputKind.DELTA,
RequestOutputKind.FINAL_ONLY,
],
)
@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"])
@pytest.mark.parametrize("async_scheduling", [True, False])
@pytest.mark.asyncio
async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str,
async_scheduling: bool):
stats_loggers = {}
@dataclass
class SimpleStatsLogger(StatLoggerBase):
init_count: int = 0
finished_req_count: int = 0
def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
stats_loggers[engine_index] = self
def record(self,
scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats],
engine_idx: int = 0):
if iteration_stats:
self.finished_req_count += len(
iteration_stats.finished_requests)
def log_engine_initialized(self):
self.init_count += 1
with ExitStack() as after:
prompt = "This is a test of data parallel"
engine_args.data_parallel_backend = data_parallel_backend
engine_args.async_scheduling = async_scheduling
engine = AsyncLLM.from_engine_args(engine_args,
stat_loggers=[SimpleStatsLogger])
after.callback(engine.shutdown)
NUM_REQUESTS = 100
NUM_EXPECTED_TOKENS = 10
request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
# Create concurrent requests.
tasks = []
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS)))
# Short sleep to ensure that requests are distributed.
await asyncio.sleep(0.01)
# Confirm that we got all the EXPECTED tokens from the requests.
done, pending = await asyncio.wait(tasks,
return_when=asyncio.FIRST_EXCEPTION)
for task in pending:
task.cancel()
for task in done:
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
assert not engine.output_processor.has_unfinished_requests()
# testing internals here which may break
core_client: DPAsyncMPClient = engine.engine_core
# the engines only synchronize stopping every N steps so
# allow a small amount of time here.
for _ in range(10):
if not core_client.engines_running:
break
await asyncio.sleep(0.5)
assert not core_client.engines_running
assert not core_client.reqs_in_flight
# Check that requests were distributed between the engines
print(f"Stats loggers after test: {stats_loggers}")
assert len(stats_loggers) == DP_SIZE
assert stats_loggers[0].init_count == 1
for sl in stats_loggers.values():
slogger: SimpleStatsLogger = sl
assert slogger.finished_req_count > NUM_REQUESTS // (
DP_SIZE + 1), f"requests are imbalanced: {stats_loggers}"

View File

@ -0,0 +1,357 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import threading
import time
from contextlib import AsyncExitStack
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for external LB testing
DP_SIZE = int(os.getenv("DP_SIZE", "2"))
# Default tensor parallel size to use
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
class ExternalLBServerManager:
"""Manages data parallel vLLM server instances for external
load balancer testing."""
def __init__(self,
model_name: str,
dp_size: int,
api_server_count: int,
base_server_args: list,
tp_size: int = TP_SIZE):
self.model_name = model_name
self.dp_size = dp_size
self.tp_size = tp_size
self.api_server_count = api_server_count
self.base_server_args = base_server_args
self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
self.server_threads: list[threading.Thread] = []
def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
"""Start all server instances for external LB mode."""
for rank in range(self.dp_size):
# Create server args for this specific rank
server_args = self.base_server_args.copy()
# Add external LB specific arguments
server_args.extend([
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-rank",
str(rank),
"--data-parallel-size-local",
"1",
"--tensor-parallel-size",
str(self.tp_size),
"--port",
str(8000 + rank), # Different port for each rank
"--api-server-count",
str(self.api_server_count),
])
# Use a thread to start each server to allow parallel initialization
def start_server(r: int, sargs: list[str]):
try:
# Start the server
server = RemoteOpenAIServer(
self.model_name,
sargs,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var:
",".join(
str(
current_platform.
device_id_to_physical_device_id(i))
for i in range(r * TP_SIZE, (r + 1) * TP_SIZE))
})
server.__enter__()
print(f"Server rank {r} started successfully with "
f"{self.api_server_count} API servers")
self.servers.append((server, sargs))
except Exception as e:
print(f"Failed to start server rank {r}: {e}")
raise
thread = threading.Thread(target=start_server,
args=(rank, server_args))
thread.start()
self.server_threads.append(thread)
# Wait for all servers to start
for thread in self.server_threads:
thread.join()
# Give servers additional time to fully initialize and coordinate
time.sleep(2)
if len(self.servers) != self.dp_size:
raise Exception("Servers failed to start")
return self.servers
def __exit__(self, exc_type, exc_val, exc_tb):
"""Stop all server instances."""
while self.servers:
try:
self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
except Exception as e:
print(f"Error stopping server: {e}")
@pytest.fixture(scope="module")
def default_server_args():
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
]
@pytest.fixture(scope="module", params=[1, 4])
def server_manager(request, default_server_args):
api_server_count = request.param
server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args)
with server_manager:
yield server_manager
@pytest.fixture
def servers(server_manager):
return server_manager.servers
@pytest_asyncio.fixture
async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
# Create a client for each server
async with AsyncExitStack() as stack:
yield [
await stack.enter_async_context(server.get_async_client())
for server, _ in servers
]
def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()
vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]
def test_external_lb_server_info(server_manager):
servers = server_manager.servers
api_server_count = server_manager.api_server_count
for i, (server, _) in enumerate(servers):
print(f"Testing {i=}")
# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(server) for _ in range(n_reqs)
]
api_process_counts = [
c["_api_process_count"] for c in parallel_configs
]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_external_lb_single_completion(clients: list[
openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
model_name: str) -> None:
async def make_request(client: openai.AsyncOpenAI):
completion = await client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=10,
temperature=1.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
choice = completion.choices[0]
# The exact number of tokens can vary slightly with temperature=1.0,
# so we check for a reasonable minimum length.
assert len(choice.text) >= 1
# Finish reason might not always be 'length' if the model finishes early
# or due to other reasons, especially with high temperature.
# So, we'll accept 'length' or 'stop'.
assert choice.finish_reason in ("length", "stop")
# Token counts can also vary, so we check they are positive.
assert completion.usage.completion_tokens > 0
assert completion.usage.prompt_tokens > 0
assert completion.usage.total_tokens > 0
return completion
# Test single request to each server
for i, client in enumerate(clients):
result = await make_request(client)
assert result is not None
print(f"Server {i} handled single completion request successfully")
await asyncio.sleep(0.5)
# Send requests to all servers in round-robin fashion
num_requests_per_server = 25 # Total 50 requests across 2 servers
all_tasks = []
for i, client in enumerate(clients):
tasks = [make_request(client) for _ in range(num_requests_per_server)]
all_tasks.extend(tasks)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_server * len(clients)
assert all(completion is not None for completion in results)
await asyncio.sleep(0.5)
# Second burst of requests
all_tasks = []
for i, client in enumerate(clients):
tasks = [make_request(client) for _ in range(num_requests_per_server)]
all_tasks.extend(tasks)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_server * len(clients)
assert all(completion is not None for completion in results)
_, server_args = servers[0]
api_server_count = (
server_args.count('--api-server-count')
and server_args[server_args.index('--api-server-count') + 1] or 1)
print(
f"Successfully completed external LB test with {len(clients)} servers "
f"(API server count: {api_server_count})")
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_external_lb_completion_streaming(clients: list[
openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
model_name: str) -> None:
prompt = "What is an LLM?"
async def make_streaming_request(client: openai.AsyncOpenAI):
# Perform a non-streaming request to get the expected full output
single_completion = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
# Perform the streaming request
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True)
chunks: list[str] = []
finish_reason_count = 0
last_chunk = None
async for chunk in stream:
chunks.append(chunk.choices[0].text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
last_chunk = chunk # Keep track of the last chunk
# finish reason should only return in the last block for OpenAI API
assert finish_reason_count == 1, (
"Finish reason should appear exactly once.")
assert last_chunk is not None, (
"Stream should have yielded at least one chunk.")
assert last_chunk.choices[
0].finish_reason == "length", "Finish reason should be 'length'."
# Check that the combined text matches the non-streamed version.
assert "".join(
chunks
) == single_output, "Streamed output should match non-streamed output."
return True # Indicate success for this request
# Test single request to each server
for i, client in enumerate(clients):
result = await make_streaming_request(client)
assert result is not None
print(f"Server {i} handled single streaming request successfully")
await asyncio.sleep(0.5)
# Send streaming requests to all servers in round-robin fashion
num_requests_per_server = 25 # Total 50 requests across 2 servers
all_tasks = []
for i, client in enumerate(clients):
tasks = [
make_streaming_request(client)
for _ in range(num_requests_per_server)
]
all_tasks.extend(tasks)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_server * len(clients)
assert all(results), "Not all streaming requests completed successfully."
await asyncio.sleep(0.5)
# Second burst of streaming requests
all_tasks = []
for i, client in enumerate(clients):
tasks = [
make_streaming_request(client)
for _ in range(num_requests_per_server)
]
all_tasks.extend(tasks)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests_per_server * len(clients)
assert all(results), "Not all streaming requests completed successfully."
_, server_args = servers[0]
api_server_count = (
server_args.count('--api-server-count')
and server_args[server_args.index('--api-server-count') + 1] or 1)
print(f"Successfully completed external LB streaming test with "
f"{len(clients)} servers (API server count: {api_server_count})")

View File

@ -0,0 +1,394 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import threading
import time
from contextlib import AsyncExitStack
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer
from tests.v1.utils import check_request_balancing
from vllm.platforms import current_platform
MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for hybrid LB testing (4 total)
DP_SIZE = int(os.getenv("DP_SIZE", "4"))
# Default tensor parallel size to use
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
# Number of nodes (2 nodes, each with 2 DP ranks)
NUM_NODES = 2
DP_SIZE_LOCAL = DP_SIZE // NUM_NODES # 2 ranks per node
class HybridLBServerManager:
"""Manages hybrid data parallel vLLM server instances where each node
runs a single logical API server that balances requests only to the
DP engines running on that same node."""
def __init__(self,
model_name: str,
dp_size: int,
api_server_count: int,
base_server_args: list,
dp_size_local: int = DP_SIZE_LOCAL,
tp_size: int = TP_SIZE):
self.model_name = model_name
self.dp_size = dp_size
self.dp_size_local = dp_size_local
self.tp_size = tp_size
self.api_server_count = api_server_count
self.base_server_args = base_server_args
self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
self.server_threads: list[threading.Thread] = []
self.num_nodes = dp_size // dp_size_local
def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
"""Start all server instances for hybrid LB mode."""
for node_id in range(self.num_nodes):
# Create server args for this specific node
server_args = self.base_server_args.copy()
# Calculate start rank for this node
start_rank = node_id * self.dp_size_local
# Add hybrid LB specific arguments
server_args.extend([
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-size-local",
str(self.dp_size_local),
"--data-parallel-start-rank",
str(start_rank),
"--data-parallel-hybrid-lb", # Enable hybrid LB mode
"--tensor-parallel-size",
str(self.tp_size),
"--port",
str(8000 + node_id), # Different port for each node
"--api-server-count",
str(self.api_server_count),
"--data-parallel-address",
"127.0.0.1",
"--data-parallel-rpc-port",
"13345",
])
# Use a thread to start each server to allow parallel initialization
def start_server(node: int, sargs: list[str]):
try:
# Calculate GPU devices for this node
gpus_per_node = self.dp_size_local * self.tp_size
gpu_start = node * gpus_per_node
gpu_end = gpu_start + gpus_per_node
# Start the server
server = RemoteOpenAIServer(
self.model_name,
sargs,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var:
",".join(
str(
current_platform.
device_id_to_physical_device_id(i))
for i in range(gpu_start, gpu_end))
})
server.__enter__()
print(f"Hybrid LB node {node} started successfully with "
f"{self.dp_size_local} local DP ranks and "
f"{self.api_server_count} API servers")
self.servers.append((server, sargs))
except Exception as e:
print(f"Failed to start hybrid LB node {node}: {e}")
raise
thread = threading.Thread(target=start_server,
args=(node_id, server_args))
thread.start()
self.server_threads.append(thread)
# Wait for all servers to start
for thread in self.server_threads:
thread.join()
# Give servers additional time to fully initialize and coordinate
time.sleep(3)
if len(self.servers) != self.num_nodes:
raise Exception("Servers failed to start")
return self.servers
def __exit__(self, exc_type, exc_val, exc_tb):
"""Stop all server instances."""
while self.servers:
try:
self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
except Exception as e:
print(f"Error stopping server: {e}")
@pytest.fixture(scope="module")
def default_server_args():
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
]
@pytest.fixture(scope="module", params=[1, 4])
def server_manager(request, default_server_args):
api_server_count = request.param
server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args, DP_SIZE_LOCAL,
TP_SIZE)
with server_manager:
yield server_manager
@pytest.fixture
def servers(server_manager):
return server_manager.servers
@pytest_asyncio.fixture
async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
# Create a client for each node (each node has its own API endpoint)
async with AsyncExitStack() as stack:
yield [
await stack.enter_async_context(server.get_async_client())
for server, _ in servers
]
def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()
vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]
def test_hybrid_dp_server_info(server_manager):
servers = server_manager.servers
api_server_count = server_manager.api_server_count
for i, (server, _) in enumerate(servers):
print(f"Testing {i=}")
# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(server) for _ in range(n_reqs)
]
api_process_counts = [
c["_api_process_count"] for c in parallel_configs
]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
servers: list[tuple[RemoteOpenAIServer,
list[str]]],
model_name: str) -> None:
async def make_request(client: openai.AsyncOpenAI):
completion = await client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=1.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
choice = completion.choices[0]
# The exact number of tokens can vary slightly with temperature=1.0,
# so we check for a reasonable minimum length.
assert len(choice.text) >= 1
# Finish reason might not always be 'length' if the model finishes early
# or due to other reasons, especially with high temperature.
# So, we'll accept 'length' or 'stop'.
assert choice.finish_reason in ("length", "stop")
# Token counts can also vary, so we check they are positive.
assert completion.usage.completion_tokens > 0
assert completion.usage.prompt_tokens > 0
assert completion.usage.total_tokens > 0
return completion
# Test single request to each node
for i, client in enumerate(clients):
result = await make_request(client)
assert result is not None
print(
f"Hybrid LB node {i} handled single completion request successfully"
)
await asyncio.sleep(0.5)
# Send requests to all nodes - each should balance within its local DP ranks
num_requests = 200 # Total 200 requests across 2 nodes
all_tasks = []
for i in range(num_requests):
client = clients[i % len(clients)]
all_tasks.append(asyncio.create_task(make_request(client)))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(completion is not None for completion in results)
await asyncio.sleep(0.5)
# Second burst of requests
all_tasks = []
for i in range(num_requests):
client = clients[i % len(clients)]
all_tasks.append(asyncio.create_task(make_request(client)))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(completion is not None for completion in results)
_, server_args = servers[0]
api_server_count = (
server_args.count('--api-server-count')
and server_args[server_args.index('--api-server-count') + 1] or 1)
print(
f"Successfully completed hybrid LB test with {len(clients)} nodes "
f"({DP_SIZE_LOCAL} DP ranks each, API server count: {api_server_count})"
)
# Check request balancing within each node
for i, (server, _) in enumerate(servers):
print(f"Checking request balancing for node {i}")
check_request_balancing(server, DP_SIZE_LOCAL)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_hybrid_lb_completion_streaming(clients: list[
openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
model_name: str) -> None:
prompt = "What is an LLM?"
async def make_streaming_request(client: openai.AsyncOpenAI):
# Perform a non-streaming request to get the expected full output
single_completion = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
# Perform the streaming request
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True)
chunks: list[str] = []
finish_reason_count = 0
last_chunk = None
async for chunk in stream:
chunks.append(chunk.choices[0].text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
last_chunk = chunk # Keep track of the last chunk
# finish reason should only return in the last block for OpenAI API
assert finish_reason_count == 1, (
"Finish reason should appear exactly once.")
assert last_chunk is not None, (
"Stream should have yielded at least one chunk.")
assert last_chunk.choices[
0].finish_reason == "length", "Finish reason should be 'length'."
# Check that the combined text matches the non-streamed version.
assert "".join(
chunks
) == single_output, "Streamed output should match non-streamed output."
return True # Indicate success for this request
# Test single request to each node
for i, client in enumerate(clients):
result = await make_streaming_request(client)
assert result is not None
print(
f"Hybrid LB node {i} handled single streaming request successfully"
)
await asyncio.sleep(0.5)
# Send streaming requests to all nodes
num_requests = 200 # Total 200 requests across 2 nodes
all_tasks = []
for i in range(num_requests):
client = clients[i % len(clients)]
all_tasks.append(asyncio.create_task(make_streaming_request(client)))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully."
await asyncio.sleep(0.5)
# Second burst of streaming requests
all_tasks = []
for i in range(num_requests):
client = clients[i % len(clients)]
all_tasks.append(asyncio.create_task(make_streaming_request(client)))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully."
_, server_args = servers[0]
api_server_count = (
server_args.count('--api-server-count')
and server_args[server_args.index('--api-server-count') + 1] or 1)
print(f"Successfully completed hybrid LB streaming test with "
f"{len(clients)} nodes ({DP_SIZE_LOCAL} DP ranks each, "
f"API server count: {api_server_count})")
# Check request balancing within each node
for i, (server, _) in enumerate(servers):
print(f"Checking streaming request balancing for node {i}")
check_request_balancing(server, DP_SIZE_LOCAL)

View File

@ -0,0 +1,718 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
import threading
import time
import traceback
from typing import Optional, cast
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer
from tests.v1.utils import check_request_balancing
from vllm.platforms import current_platform
MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for multi-node internal LB testing
DP_SIZE = int(os.getenv("DP_SIZE", "2"))
# Default tensor parallel size to use
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
# Number of nodes to simulate
NUM_NODES = 2
class MultinodeInternalLBServerManager:
"""Manages multi-node data parallel vLLM server instances for internal
load balancer testing using --headless mode."""
def __init__(self,
model_name: str,
dp_size: int,
api_server_count: int,
base_server_args: list,
dp_per_node: int = 1,
tp_size: int = TP_SIZE):
self.model_name = model_name
self.dp_size = dp_size
self.dp_per_node = dp_per_node
self.tp_size = tp_size
self.api_server_count = api_server_count
self.base_server_args = base_server_args
self.servers: list[Optional[tuple[RemoteOpenAIServer,
list[str]]]] = [None] * (dp_size //
dp_per_node)
self.server_threads: list[threading.Thread] = []
def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
"""Start all server instances for multi-node internal LB mode."""
for server_idx, rank in enumerate(
range(0, self.dp_size, self.dp_per_node)):
# Create server args for this specific rank
server_args = self.base_server_args.copy()
if rank == 0:
# Head node - runs API server and first DP rank
server_args.extend([
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-size-local",
str(self.dp_per_node),
"--tensor-parallel-size",
str(self.tp_size),
"--port",
"8000", # Single endpoint for all requests
"--api-server-count",
str(self.api_server_count),
"--data-parallel-address",
"127.0.0.1",
"--data-parallel-rpc-port",
"13345",
])
else:
# Secondary nodes - run in headless mode
server_args.extend([
"--headless",
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-size-local",
str(self.dp_per_node),
"--data-parallel-start-rank",
str(rank),
"--tensor-parallel-size",
str(self.tp_size),
"--data-parallel-address",
"127.0.0.1",
"--data-parallel-rpc-port",
"13345",
])
# Use a thread to start each server to allow parallel initialization
def start_server(sidx: int, r: int, sargs: list[str]):
gpus_per_node = self.tp_size * self.dp_per_node
try:
# Start the server
server = RemoteOpenAIServer(
self.model_name,
sargs,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var:
",".join(
str(
current_platform.
device_id_to_physical_device_id(i))
for i in range(r, r + gpus_per_node))
})
server.__enter__()
if r == 0:
print(
f"Head node (rank {r}) started successfully with "
f"{self.api_server_count} API servers")
else:
print(f"Headless node (rank {r}) started successfully")
self.servers[sidx] = (server, sargs)
except Exception as e:
print(f"Failed to start server rank {r}: {e}")
traceback.print_exc()
raise
thread = threading.Thread(target=start_server,
args=(server_idx, rank, server_args))
thread.start()
self.server_threads.append(thread)
# Wait for all servers to start
for thread in self.server_threads:
thread.join()
# Give servers additional time to fully initialize and coordinate
time.sleep(3)
if not all(self.servers):
raise Exception("Servers failed to start")
return cast(list[tuple[RemoteOpenAIServer, list[str]]], self.servers)
def __exit__(self, exc_type, exc_val, exc_tb):
"""Stop all server instances."""
while self.servers:
if server := self.servers.pop():
try:
server[0].__exit__(exc_type, exc_val, exc_tb)
except Exception as e:
print(f"Error stopping server: {e}")
traceback.print_exc()
class APIOnlyServerManager:
"""Manages API-only server (Node 0) and headless engines server (Node 1)
for testing separated API server and engine configuration."""
def __init__(self,
model_name: str,
dp_size: int,
api_server_count: int,
base_server_args: list,
tp_size: int = TP_SIZE):
self.model_name = model_name
self.dp_size = dp_size
self.tp_size = tp_size
self.api_server_count = api_server_count
self.base_server_args = base_server_args
self.servers: list[Optional[tuple[RemoteOpenAIServer,
list[str]]]] = [None] * 2
self.server_threads: list[threading.Thread] = []
def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
"""Start API-only server and headless engines server."""
# Start API-only server (Node 0) - no engines, only API server
api_server_args = self.base_server_args.copy()
api_server_args.extend([
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-size-local",
"0", # No engines on this node
"--tensor-parallel-size",
str(self.tp_size),
"--port",
"8000",
"--api-server-count",
str(self.api_server_count),
"--data-parallel-address",
"127.0.0.1",
"--data-parallel-rpc-port",
"13345",
])
# Start headless engines server (Node 1) - all engines, no API server
engines_server_args = self.base_server_args.copy()
engines_server_args.extend([
"--headless",
"--data-parallel-size",
str(self.dp_size),
"--data-parallel-size-local",
str(self.dp_size), # All engines on this node
"--tensor-parallel-size",
str(self.tp_size),
"--data-parallel-address",
"127.0.0.1",
"--data-parallel-rpc-port",
"13345",
])
# Use threads to start both servers in parallel
def start_api_server():
try:
server = RemoteOpenAIServer(
self.model_name,
api_server_args,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE": "1",
# No GPUs needed for API-only server
})
server.__enter__()
print(f"API-only server started successfully with "
f"{self.api_server_count} API servers")
self.servers[0] = (server, api_server_args)
except Exception as e:
print(f"Failed to start API-only server: {e}")
raise
def start_engines_server():
try:
server = RemoteOpenAIServer(
self.model_name,
engines_server_args,
auto_port=False,
env_dict={
current_platform.device_control_env_var:
",".join(
str(
current_platform.
device_id_to_physical_device_id(i))
for i in range(self.dp_size * self.tp_size))
})
server.__enter__()
print(f"Headless engines server started successfully with "
f"{self.dp_size} engines")
self.servers[1] = (server, engines_server_args)
except Exception as e:
print(f"Failed to start headless engines server: {e}")
raise
# Start API server first
api_thread = threading.Thread(target=start_api_server)
api_thread.start()
self.server_threads.append(api_thread)
# Start engines server second
engines_thread = threading.Thread(target=start_engines_server)
engines_thread.start()
self.server_threads.append(engines_thread)
# Wait for both servers to start
for thread in self.server_threads:
thread.join()
# Give servers additional time to fully initialize and coordinate
time.sleep(3)
if not all(self.servers):
raise Exception("Both servers failed to start")
return cast(list[tuple[RemoteOpenAIServer, list[str]]], self.servers)
def __exit__(self, exc_type, exc_val, exc_tb):
"""Stop both server instances."""
while self.servers:
if server := self.servers.pop():
try:
server[0].__exit__(exc_type, exc_val, exc_tb)
except Exception as e:
print(f"Error stopping server: {e}")
traceback.print_exc()
@pytest.fixture(scope="module")
def default_server_args():
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
]
@pytest.fixture(scope="module", params=[1, 4])
def server_manager(request, default_server_args):
api_server_count = request.param
server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args,
DP_SIZE // NUM_NODES,
TP_SIZE)
with server_manager:
yield server_manager
@pytest.fixture
def servers(server_manager):
return server_manager.servers
@pytest.fixture(scope="module", params=[1, 4])
def api_only_servers(request, default_server_args):
"""Fixture for API-only server + headless engines configuration."""
api_server_count = request.param
with APIOnlyServerManager(MODEL_NAME, DP_SIZE, api_server_count,
default_server_args, TP_SIZE) as server_list:
yield server_list
@pytest_asyncio.fixture
async def client(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
# For internal LB, we only connect to the head node (rank 0)
# which provides the single API endpoint
head_server = servers[0][0]
async with head_server.get_async_client() as client:
yield client
@pytest_asyncio.fixture
async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
list[str]]]):
"""Client fixture for API-only server configuration."""
# Connect to the API-only server (first server in the list)
api_server = api_only_servers[0][0]
async with api_server.get_async_client() as client:
yield client
def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()
vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]
def test_multinode_dp_server_info(server_manager):
head_server = server_manager.servers[0][0]
api_server_count = server_manager.api_server_count
# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(head_server) for _ in range(n_reqs)
]
api_process_counts = [c["_api_process_count"] for c in parallel_configs]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
servers: list[tuple[RemoteOpenAIServer,
list[str]]],
model_name: str) -> None:
async def make_request():
completion = await client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=1.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
choice = completion.choices[0]
# The exact number of tokens can vary slightly with temperature=1.0,
# so we check for a reasonable minimum length.
assert len(choice.text) >= 1
# Finish reason might not always be 'length' if the model finishes early
# or due to other reasons, especially with high temperature.
# So, we'll accept 'length' or 'stop'.
assert choice.finish_reason in ("length", "stop")
# Token counts can also vary, so we check they are positive.
assert completion.usage.completion_tokens > 0
assert completion.usage.prompt_tokens > 0
assert completion.usage.total_tokens > 0
return completion
# Test single request
result = await make_request()
assert result is not None
print(
"Multi-node internal LB handled single completion request successfully"
)
await asyncio.sleep(0.5)
# Send multiple requests - internal LB should distribute across DP ranks
num_requests = 200
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(completion is not None for completion in results)
await asyncio.sleep(0.5)
# Second burst of requests
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(completion is not None for completion in results)
_, server_args = servers[0]
api_server_count = (
server_args.count('--api-server-count')
and server_args[server_args.index('--api-server-count') + 1] or 1)
print(f"Successfully completed multi-node internal LB test with "
f"{len(servers)} DP ranks (API server count: {api_server_count})")
# Check request balancing via Prometheus metrics
head_server = servers[0][0]
check_request_balancing(head_server, DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_multinode_dp_completion_streaming(client: openai.AsyncOpenAI,
servers: list[
tuple[RemoteOpenAIServer,
list[str]]],
model_name: str) -> None:
prompt = "What is an LLM?"
async def make_streaming_request():
# Perform a non-streaming request to get the expected full output
single_completion = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
# Perform the streaming request
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True)
chunks: list[str] = []
finish_reason_count = 0
last_chunk = None
async for chunk in stream:
chunks.append(chunk.choices[0].text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
last_chunk = chunk # Keep track of the last chunk
# finish reason should only return in the last block for OpenAI API
assert finish_reason_count == 1, (
"Finish reason should appear exactly once.")
assert last_chunk is not None, (
"Stream should have yielded at least one chunk.")
assert last_chunk.choices[
0].finish_reason == "length", "Finish reason should be 'length'."
# Check that the combined text matches the non-streamed version.
assert "".join(
chunks
) == single_output, "Streamed output should match non-streamed output."
return True # Indicate success for this request
# Test single streaming request
result = await make_streaming_request()
assert result is not None
print(
"Multi-node internal LB handled single streaming request successfully")
await asyncio.sleep(0.5)
# Send multiple streaming requests - internal LB should distribute across
# DP ranks
num_requests = 200
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully."
await asyncio.sleep(0.5)
# Second burst of streaming requests
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully."
_, server_args = servers[0]
api_server_count = (
server_args.count('--api-server-count')
and server_args[server_args.index('--api-server-count') + 1] or 1)
print(f"Successfully completed multi-node internal LB streaming test with "
f"{len(servers)} DP ranks (API server count: {api_server_count})")
# Check request balancing via Prometheus metrics
head_server = servers[0][0]
check_request_balancing(head_server, DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_api_only_multinode_dp_completion(
api_only_client: openai.AsyncOpenAI,
api_only_servers: list[tuple[RemoteOpenAIServer,
list[str]]], model_name: str) -> None:
"""Test API-only server with all engines on separate headless server."""
async def make_request():
completion = await api_only_client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=1.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
choice = completion.choices[0]
# The exact number of tokens can vary slightly with temperature=1.0,
# so we check for a reasonable minimum length.
assert len(choice.text) >= 1
# Finish reason might not always be 'length' if the model finishes
# early or due to other reasons, especially with high temperature.
# So, we'll accept 'length' or 'stop'.
assert choice.finish_reason in ("length", "stop")
# Token counts can also vary, so we check they are positive.
assert completion.usage.completion_tokens > 0
assert completion.usage.prompt_tokens > 0
assert completion.usage.total_tokens > 0
return completion
# Test single request
result = await make_request()
assert result is not None
print("API-only server handled single completion request successfully")
await asyncio.sleep(0.5)
# Send multiple requests - should be distributed across engines on
# headless server
num_requests = 200
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(completion is not None for completion in results)
await asyncio.sleep(0.5)
# Second burst of requests
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(completion is not None for completion in results)
api_server, api_server_args = api_only_servers[0]
api_server_count = (
api_server_args.count('--api-server-count')
and api_server_args[api_server_args.index('--api-server-count') + 1]
or 1)
print(f"Successfully completed API-only multi-node test with {DP_SIZE} "
f"engines on headless server (API server count: {api_server_count})")
# Check request balancing via Prometheus metrics
check_request_balancing(api_server, DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_api_only_multinode_dp_completion_streaming(
api_only_client: openai.AsyncOpenAI,
api_only_servers: list[tuple[RemoteOpenAIServer,
list[str]]], model_name: str) -> None:
"""Test API-only server streaming with all engines on separate
headless server."""
prompt = "What is an LLM?"
async def make_streaming_request():
# Perform a non-streaming request to get the expected full output
single_completion = await api_only_client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
# Perform the streaming request
stream = await api_only_client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True)
chunks: list[str] = []
finish_reason_count = 0
last_chunk = None
async for chunk in stream:
chunks.append(chunk.choices[0].text)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
last_chunk = chunk # Keep track of the last chunk
# finish reason should only return in the last block for OpenAI API
assert finish_reason_count == 1, (
"Finish reason should appear exactly once.")
assert last_chunk is not None, (
"Stream should have yielded at least one chunk.")
assert last_chunk.choices[
0].finish_reason == "length", "Finish reason should be 'length'."
# Check that the combined text matches the non-streamed version.
assert "".join(
chunks
) == single_output, "Streamed output should match non-streamed output."
return True # Indicate success for this request
# Test single streaming request
result = await make_streaming_request()
assert result is not None
print("API-only server handled single streaming request successfully")
await asyncio.sleep(0.5)
# Send multiple streaming requests - should be distributed across engines
num_requests = 200
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully."
await asyncio.sleep(0.5)
# Second burst of streaming requests
all_tasks = []
for _ in range(num_requests):
all_tasks.append(asyncio.create_task(make_streaming_request()))
await asyncio.sleep(0.01)
results = await asyncio.gather(*all_tasks)
assert len(results) == num_requests
assert all(results), "Not all streaming requests completed successfully."
_, api_server_args = api_only_servers[0]
api_server_count = (
api_server_args.count('--api-server-count')
and api_server_args[api_server_args.index('--api-server-count') + 1]
or 1)
print(f"Successfully completed API-only streaming test with {DP_SIZE} "
f"engines on headless server (API server count: {api_server_count})")
# Check request balancing via Prometheus metrics
api_server = api_only_servers[0][0]
check_request_balancing(api_server, DP_SIZE)