[CI] Add Decode Context Parallelism (DCP) test to CI (#24487)

Signed-off-by: Ming Yang <minos.future@gmail.com>
This commit is contained in:
Ming Yang
2025-09-16 06:21:28 -07:00
committed by GitHub
parent e4f0b4cd96
commit 4e5affeaa1
2 changed files with 21 additions and 7 deletions

View File

@@ -946,7 +946,6 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed) # 17 min
timeout_in_minutes: 30
@@ -1020,9 +1019,21 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: Qwen MoE EP Test # optional
##### H200 test #####
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
##### B200 test #####
- label: Distributed Tests (B200) # optional
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
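
The H200 and B200 steps above run tests/distributed/test_context_parallel.py on 2-GPU runners. A rough sketch for reproducing that test step locally, assuming the vLLM repo root as working directory and two suitable GPUs (the device indices below are illustrative, not taken from the pipeline config):

# Local reproduction sketch for the DCP CI step (assumptions: repo root as CWD,
# two visible GPUs; device indices are illustrative).
import os
import subprocess

env = dict(os.environ, CUDA_VISIBLE_DEVICES="0,1")  # matches num_gpus: 2
subprocess.run(
    ["pytest", "-v", "-s", "tests/distributed/test_context_parallel.py"],
    env=env,
    check=True,
)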

View File

@@ -71,12 +71,13 @@ class CPTestSettings:
parallel_setups = []
for eager_mode_val in [False]:
for pp_multiplier in [1]:
for dcp_multiplier in [2, 4]:
for dcp_multiplier in [0.5, 1]:
for chunked_prefill_val in [True]:
parallel_setups.append(
ParallelSetup(tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
dcp_size=dcp_multiplier * dcp_base,
dcp_size=int(dcp_multiplier *
tp_base),
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val))
return CPTestSettings(
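
A minimal, self-contained sketch of what the updated loop produces, using a stand-in for the ParallelSetup container defined in this test file: with the new fractional multipliers [0.5, 1], dcp_size is derived from tp_base instead of dcp_base, so for tp_base=2 the generated setups use dcp_size 1 and 2 and the DCP degree never exceeds the TP degree.

from collections import namedtuple

# Stand-in for the ParallelSetup container used in test_context_parallel.py.
ParallelSetup = namedtuple(
    "ParallelSetup",
    ["tp_size", "pp_size", "dcp_size", "eager_mode", "chunked_prefill"])


def build_setups(tp_base: int, pp_base: int = 1):
    parallel_setups = []
    for eager_mode_val in [False]:
        for pp_multiplier in [1]:
            for dcp_multiplier in [0.5, 1]:
                for chunked_prefill_val in [True]:
                    parallel_setups.append(
                        ParallelSetup(tp_size=tp_base,
                                      pp_size=pp_multiplier * pp_base,
                                      dcp_size=int(dcp_multiplier * tp_base),
                                      eager_mode=eager_mode_val,
                                      chunked_prefill=chunked_prefill_val))
    return parallel_setups


# tp_base=2 -> dcp_size 1 and 2; tp_base=4 would give 2 and 4.
for setup in build_setups(tp_base=2):
    print(setup)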
@@ -223,7 +224,9 @@ def _compare_cp_with_tp(
CP_TEXT_GENERATION_MODELS = {
# [MLA attention only]
"deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
"deepseek-ai/DeepSeek-V2-Lite-Chat":
[CPTestSettings.detailed(),
CPTestSettings.detailed(tp_base=2)],
}
CP_TEST_MODELS = [
@@ -238,7 +241,7 @@ CP_TEST_MODELS = [
"runner", "test_options"),
[
params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id)
for setting in settings for params in setting.iter_params(model_id)
if model_id in CP_TEST_MODELS
],
)
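
Because each model now maps to a list of settings rather than a single CPTestSettings instance, the parametrize comprehension gains an extra "for setting in settings" level. A simplified, self-contained sketch of the flattening it performs; the Settings class and tables below are stand-ins for CPTestSettings and the real tables in this file:

class Settings:
    """Stand-in for CPTestSettings; only what is needed to show the flattening."""

    def __init__(self, tp_base: int = 1):
        self.tp_base = tp_base

    def iter_params(self, model_id: str):
        # The real iter_params yields full parallel-setup parameter tuples;
        # a short label is enough here.
        yield (model_id, f"tp_base={self.tp_base}")


CP_TEXT_GENERATION_MODELS = {
    "deepseek-ai/DeepSeek-V2-Lite-Chat": [Settings(), Settings(tp_base=2)],
}
CP_TEST_MODELS = ["deepseek-ai/DeepSeek-V2-Lite-Chat"]

params = [
    p for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
    for setting in settings for p in setting.iter_params(model_id)
    if model_id in CP_TEST_MODELS
]
print(params)  # one entry per (model, settings) pair, covering both TP bases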