[CI] Add Decode Context Parallelism (DCP) test to CI (#24487)
Signed-off-by: Ming Yang <minos.future@gmail.com>
@@ -946,7 +946,6 @@ steps:
   commands:
     - pytest -v -s distributed/test_pp_cudagraph.py
     - pytest -v -s distributed/test_pipeline_parallel.py
-    # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 
 - label: LoRA TP Test (Distributed) # 17 min
   timeout_in_minutes: 30
@@ -1020,9 +1019,21 @@ steps:
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distrubted Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
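Reviewer note (not part of the commit): a minimal sketch of what the new H200/B200 entry exercises, for local reproduction on a 2-GPU node. It assumes this vLLM build exposes the decode_context_parallel_size engine argument that test_context_parallel.py targets; the model is the MLA one from the test matrix below, and treat the exact keyword name as an assumption of this sketch.

from vllm import LLM, SamplingParams

# Assumed engine knob: decode_context_parallel_size (the "dcp" size under test).
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite-Chat",  # MLA-attention model from the test matrix
    trust_remote_code=True,
    tensor_parallel_size=2,
    decode_context_parallel_size=2,  # assumed semantics: shard the decode-time KV/context across ranks
    max_model_len=2048,
)
out = llm.generate(["DCP smoke test:"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)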
@@ -71,12 +71,13 @@ class CPTestSettings:
         parallel_setups = []
         for eager_mode_val in [False]:
             for pp_multiplier in [1]:
-                for dcp_multiplier in [2, 4]:
+                for dcp_multiplier in [0.5, 1]:
                     for chunked_prefill_val in [True]:
                         parallel_setups.append(
                             ParallelSetup(tp_size=tp_base,
                                           pp_size=pp_multiplier * pp_base,
-                                          dcp_size=dcp_multiplier * dcp_base,
+                                          dcp_size=int(dcp_multiplier *
+                                                       tp_base),
                                           eager_mode=eager_mode_val,
                                           chunked_prefill=chunked_prefill_val))
         return CPTestSettings(
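The multiplier change above derives the DCP size from tp_base instead of an independent dcp_base, so dcp_size can never exceed the TP size being tested. A standalone sketch of the values the patched loop produces (assuming defaults of tp_base=4, pp_base=1, which are not shown in this hunk):

# Stand-in for the ParallelSetup used above; field names taken from the diff.
from collections import namedtuple

ParallelSetup = namedtuple(
    "ParallelSetup",
    ["tp_size", "pp_size", "dcp_size", "eager_mode", "chunked_prefill"])

def detailed_setups(tp_base=4, pp_base=1):
    return [
        ParallelSetup(tp_size=tp_base,
                      pp_size=pp_multiplier * pp_base,
                      dcp_size=int(dcp_multiplier * tp_base),
                      eager_mode=False,
                      chunked_prefill=True)
        for pp_multiplier in [1]
        for dcp_multiplier in [0.5, 1]
    ]

for s in detailed_setups():            # tp_size=4 -> dcp_size 2 and 4
    print(s.tp_size, s.dcp_size)
for s in detailed_setups(tp_base=2):   # tp_size=2 -> dcp_size 1 and 2
    print(s.tp_size, s.dcp_size)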
@@ -223,7 +224,9 @@ def _compare_cp_with_tp(
 
 CP_TEXT_GENERATION_MODELS = {
     # [MLA attention only]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat":
+    [CPTestSettings.detailed(),
+     CPTestSettings.detailed(tp_base=2)],
 }
 
 CP_TEST_MODELS = [
@@ -238,7 +241,7 @@ CP_TEST_MODELS = [
         "runner", "test_options"),
     [
         params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_id)
+        for setting in settings for params in setting.iter_params(model_id)
         if model_id in CP_TEST_MODELS
     ],
 )
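Since each model now maps to a list of CPTestSettings (see the previous hunk), the parametrize comprehension gains one more loop level to flatten it. A minimal stand-in showing the flattening, with plain lists in place of CPTestSettings.iter_params():

# Each "settings" value is now a list; iterate its elements before expanding params.
CP_TEXT_GENERATION_MODELS = {"model-a": [["p1", "p2"], ["p3"]]}
CP_TEST_MODELS = ["model-a"]

flat = [
    params
    for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
    for setting in settings   # new inner loop over the list of settings
    for params in setting     # stands in for setting.iter_params(model_id)
    if model_id in CP_TEST_MODELS
]
print(flat)  # ['p1', 'p2', 'p3']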